Splunk_Docker/files/splunk-etc/apps/search/bin/scrub.py

#!/usr/bin/env python
# This work contains confidential material of Splunk Inc. Its use or disclosure in
#whole or in part without the express written permission of Splunk Inc. is prohibited.

import os
import re
import random
import glob
import urllib
import sys
import splunk.Intersplunk
import splunk.clilib.cli_common as comm

from builtins import chr, range
from splunk.mining.DateParser import _validateDate, _validateTime
from splunk.clilib.bundle_paths import make_splunkhome_path


# Matches a run of one or more characters that are not ASCII letters or digits.
WORD_REGEX = re.compile(r'[^a-zA-Z0-9]+')
# Same pattern wrapped in a capturing group, so splitting on it keeps the
# separator runs in the resulting list.
WORD_SPLIT = re.compile(r'([^a-zA-Z0-9]+)')

def _generateReplacement(term, nameterms):
    """Return a random stand-in for ``term`` with the same length.

    term      -- the token to replace.
    nameterms -- dict mapping a length to a list of candidate names of that
                 length.  A name that gets used is removed from its list, so
                 this argument is mutated.

    If ``term`` looks like a word and a name of the same length is still
    available, that name is returned.  Otherwise the term is rewritten
    character by character: digits become a random digit no larger than the
    original, letters become a random ASCII letter of the same case, and
    everything else is kept as is.
    """
    if looksLikeWord(term):
        # get list of names with the same length as the term
        names = nameterms.get(len(term), None)
        if names:
            # take a random name out of the pool so it is not handed out twice
            return names.pop(random.randint(1, len(names)) - 1)

    pieces = []
    for ch in term:
        if ch.isdigit():
            # return a new number that is randomly less than or equal to the
            # given value, so that ip addresses and codes are not higher than
            # the value given.  otherwise we will get ip addresses like
            # 554.785.455.545.  this assumes that if given a number, a number
            # lower than it will be equally valid.
            try:
                maxVal = int(ch)
            except ValueError:
                # isdigit() is also true for characters such as superscript
                # digits that int() cannot parse; leave those untouched
                # instead of aborting the whole scrub.
                newch = ch
            else:
                newch = str(random.randint(0, maxVal))
        elif ch.isalpha():
            if ch.islower():
                newch = chr(random.randint(97, 122))  # 'a'..'z'
            else:
                newch = chr(random.randint(65, 90))   # 'A'..'Z'
        else:
            newch = ch
        pieces.append(newch)
    return "".join(pieces)

def lengthLists(terms):
    """Group the keys of ``terms`` by their length.

    terms -- a dict whose keys are strings (the values are ignored).

    Returns a dict mapping each key length to the list of keys of that
    length, in the iteration order of ``terms``.
    """
    result = dict()
    for key in terms:
        # dict keys are already unique, so a plain append is enough; a
        # per-insert membership scan would make this quadratic for large
        # word lists.
        result.setdefault(len(key), []).append(key)
    return result


############################# BEGIN DATEFINDER

def findAllDatesAndTimes(text, timeInfoTuplet):
    """Return the spans of every valid date and time found in ``text``.

    timeInfoTuplet -- sequence whose item 0 holds the compiled time
                      expressions and item 1 the compiled date expressions.

    Date spans come first in the returned list, followed by time spans.
    """
    timePatterns, datePatterns = timeInfoTuplet[0], timeInfoTuplet[1]
    dateSpans = getAllMatches(text, datePatterns, _validateDate)
    timeSpans = getAllMatches(text, timePatterns, _validateTime)
    return dateSpans + timeSpans


def getAllMatches(text, expressions, validator):
    """Collect the (start, end) span of every validated match in ``text``.

    expressions -- iterable of compiled regular expressions.
    validator   -- callable given each match's groupdict(); the match is
                   kept only when it returns a true value.
    """
    spans = []
    for pattern in expressions:
        spans.extend(
            found.span()
            for found in pattern.finditer(text)
            if validator(found.groupdict())
        )
    return spans

def inRegions(position, regions):
    """Return True if ``position`` lies within any region, bounds included.

    regions -- iterable of sequences whose items 0 and 1 are the start and
               end of a region.
    """
    return any(span[0] <= position <= span[1] for span in regions)

def compilePatterns(formats):
    """Compile each regex source string in ``formats`` case-insensitively.

    Returns the compiled patterns as a list, in input order.
    """
    return [re.compile(pattern, re.I) for pattern in formats]

def getTimeInfoTuplet(timestampconfilename):
    """Load the timestamp configuration and compile its patterns.

    timestampconfilename -- path of the configuration file.  After symlink
                            resolution it must live inside the
                            ``etc/anonymizer`` directory of the Splunk home.

    Returns a list ``[compiledTimePatterns, compiledDatePatterns, minYear,
    maxYear]`` built from the ``timePatterns``, ``datePatterns``,
    ``minYear`` and ``maxYear`` names the file defines.

    Raises Exception when the file is outside the permitted directory, and
    KeyError when the file does not define one of the expected names.
    """
    root = os.path.realpath(make_splunkhome_path(['etc', 'anonymizer']))
    if not os.path.isabs(root):
        root = os.path.abspath(root)

    timestampconfilename = os.path.realpath(os.path.normpath(timestampconfilename))
    if not os.path.isabs(timestampconfilename):
        timestampconfilename = os.path.abspath(timestampconfilename)

    # Compare against the root *with* a trailing separator.  A bare string
    # prefix test (what os.path.commonprefix does) would also accept sibling
    # directories such as '<root>-other/file'.
    rootPrefix = root if root.endswith(os.sep) else root + os.sep
    if not timestampconfilename.startswith(rootPrefix):
        message = '*** File is not inside proper directory %s should be in %s'%(timestampconfilename, root)
        print(message)
        raise Exception(message)

    text = readText(timestampconfilename)
    # turn literal backslash-n sequences into newlines and collapse doubled newlines
    text = text.replace('\\n', '\n').replace('\n\n', '\n')
    results = {}
    # SECURITY NOTE(review): the file content is executed as Python.  Removing
    # __builtins__ is not a real sandbox; the directory check above is the
    # only thing keeping untrusted content out of this exec().
    exec(text, {"__builtins__":None}, results)
    compiledTimePatterns = compilePatterns(results['timePatterns'])
    compiledDatePatterns = compilePatterns(results['datePatterns'])
    return [compiledTimePatterns, compiledDatePatterns, results['minYear'], results['maxYear']]

############################# END DATEFINDER

################################### BEGIN COPIED FROM DCUTILS.PY

def addToMapList(map, key, value):
    """Add ``value`` to the list stored under ``key`` in ``map``.

    The list is created when ``key`` is not present yet, and ``value`` is
    only added if the list does not already contain it.  Returns the list.
    """
    if key not in map:
        map[key] = []
    bucket = map[key]
    safeAppend(bucket, value)
    return bucket


def fileWords(filename, lowercase):
    """Tokenize every line of ``filename`` and return the token counts.

    lowercase -- when true, each line is lowercased before tokenizing.

    Returns a dict mapping each token to the number of times it occurred.
    """
    terms = {}
    with open(filename) as source:
        for line in source:
            tokenize(line.lower() if lowercase else line, False, terms)
    return terms

def readText(filename):
    """Return the entire content of ``filename`` as one string."""
    with open(filename) as source:
        return source.read()

# Segment size, in characters, that tokenize() hands to findBreak() when it
# cuts a text into pieces.
MAX_SEGMENT = 1024

def findBreak(start, segSize, text):
    """Return the index of the last character of the segment starting at ``start``.

    start   -- index of the first character of the segment.
    segSize -- maximum number of characters in the segment.
    text    -- the string being segmented.

    If the segment would run to or past the end of ``text``, the last valid
    index (``len(text) - 1``) is returned.  Otherwise the function looks
    backwards from the nominal segment end, over at most 100 characters, for
    a non-alphanumeric character and returns its index so that words are not
    cut in half.  If none is found the nominal end is returned.
    """
    lastIndex = len(text) - 1
    segEnd = start + segSize - 1
    # segEnd is used as an index below, so anything beyond the last valid
    # index (including segEnd == len(text)) must stop here.
    if segEnd > lastIndex:
        return lastIndex
    for end in range(segEnd, max(start+1, segEnd-100), -1):
        if not text[end].isalnum():
            return end
    # failed to find break by going back 100 chars.  give up and break at will.
    return segEnd

def tokenize(text, wordsOnlyP, vector=None):
    """Count the tokens of ``text`` into ``vector`` and return it.

    text       -- the string to tokenize.
    wordsOnlyP -- when true, only tokens accepted by looksLikeWord() are
                  counted; otherwise every non-empty token is counted.
    vector     -- dict mapping token to count that is updated in place.  A
                  fresh dict is created when omitted, so separate calls do
                  not share state.

    The text is processed in segments of at most MAX_SEGMENT characters,
    each split on runs of characters that are not ASCII letters or digits.
    """
    if vector is None:
        vector = dict()

    textLength = len(text)
    segStart = 0
    # Walk until the text is exhausted.  findBreak() may end a segment early
    # to avoid splitting a word, so a segment count computed up front could
    # stop before the tail of the text has been tokenized.
    while segStart < textLength:
        segEnd = findBreak(segStart, MAX_SEGMENT, text)
        for token in WORD_REGEX.split(text[segStart:segEnd+1]):
            if len(token) == 0:
                continue
            if not wordsOnlyP or looksLikeWord(token):
                incCount(vector, token, 1)
        segStart = segEnd+1
    return vector


def looksLikeWord(token):
    """Return True if ``token`` looks like an ordinary word.

    A word is longer than two characters, consists only of alphabetic
    characters, and is either all one case or contains exactly one
    uppercase character.
    """
    if not all(c.isalpha() for c in token):
        return False
    upperCount = sum(1 for c in token if c.isupper())
    otherCount = len(token) - upperCount
    if len(token) <= 2:
        return False
    return upperCount == 0 or otherCount == 0 or upperCount == 1

def incCount(map, val, count):
    """Add ``count`` to the tally kept for ``val`` in ``map``, starting it if absent."""
    if val not in map:
        map[val] = count
        return
    map[val] += count


def safeAppend(list, val):
    """Append ``val`` to ``list`` unless it is already in there."""
    if val in list:
        return
    list.append(val)

################################### END COPIED FROM DCUTILS.PY

def isInt(token):
    """Return True if ``token`` starts with a digit and parses as an integer.

    Tokens with a leading sign are therefore not considered integers.
    """
    if len(token) > 0 and token[0].isdigit():
        try:
            int(token)
            return True
        except ValueError:
            # not a number after all (e.g. "12ab"); only the parse failure
            # is swallowed, anything else propagates
            pass
    return False

def caseSame(caseSource, textSource):
    """Return ``textSource`` re-cased to follow the casing of ``caseSource``.

    For each position of ``caseSource`` the character of ``textSource`` at
    the same position is uppercased or lowercased to match; where the case
    source character has no case the text character is kept unchanged.
    ``textSource`` must be at least as long as ``caseSource``.
    """
    def recase(pattern, ch):
        if pattern.isupper():
            return ch.upper()
        if pattern.islower():
            return ch.lower()
        return ch

    return "".join(recase(pattern, textSource[pos])
                   for pos, pattern in enumerate(caseSource))


def scrubValue(result, val, isRaw, allterms, replacements, publicTerms, privateTerms, nameTerms, timeInfoTuplet):
    """Return ``val`` with its non-public terms replaced.

    result         -- the event the value belongs to; a token whose lowercase
                      form is a key of it with a non-None value is kept.
    val            -- the string to scrub.
    isRaw          -- when true, date and time regions are located in ``val``
                      so integers and public terms inside them are kept.
    allterms       -- dict of token counts, updated for every token seen.
    replacements   -- dict mapping lowercase token to its replacement; new
                      replacements are stored here so they are reused.
    publicTerms    -- terms that may be left as they are.
    privateTerms   -- terms that must be replaced even if public.
    nameTerms      -- pool of names handed to _generateReplacement().
    timeInfoTuplet -- compiled time/date patterns for findAllDatesAndTimes().
    """
    dateRegions = findAllDatesAndTimes(val, timeInfoTuplet) if isRaw else []

    scrubbed = list()
    offset = 0
    # WORD_SPLIT has a capturing group, so separators come back as tokens too
    # and the pieces can simply be joined again afterwards.
    for piece in WORD_SPLIT.split(val):
        folded = piece.lower()
        incCount(allterms, piece, 1)

        isPublic = folded in publicTerms and folded not in privateTerms
        notAField = result.get(folded, None) == None
        # date regions sometimes contain extraneous text, so inside one only
        # numbers and public terms are protected
        keepInDate = inRegions(offset, dateRegions) and (isInt(piece) or isPublic)
        offset += len(piece)

        if not notAField or keepInDate:
            scrubbed.append(piece)
            continue

        # first sighting of a term that is private or not public: invent a replacement
        if folded not in replacements and not isPublic:
            replacements[folded] = _generateReplacement(piece, nameTerms)
        scrubbed.append(caseSame(piece, replacements.get(folded, piece)))
    return ''.join(scrubbed)

def scrub(results, publictermsfilename, privatefilename, nametermsfilename, dictionaryfilename, timestampconfigfilename):
    """Anonymize the eligible field values of every result in place.

    results -- list of event dicts; scrubbed values are written back to them.
    The remaining arguments are paths to the user public terms, private
    terms, names, dictionary and timestamp configuration files.
    """
    replacements = {}
    privateTerms = fileWords(privatefilename, True)
    publicTerms = fileWords(dictionaryfilename, True)
    # user public terms extend (and override) the default dictionary
    publicTerms.update(fileWords(publictermsfilename, True))
    nameTerms = lengthLists(fileWords(nametermsfilename, True))

    protectedKeys = set(["eventtype", "linecount", "punct", "sourcetype", "timeendpos", "timestartpos"])

    timeInfoTuplet = getTimeInfoTuplet(timestampconfigfilename)
    allterms = {}
    for event in results:
        for field, value in event.items():
            # skip protected attributes and the date_* attributes
            if field in protectedKeys or field.startswith("date_"):
                continue
            # skip attributes starting with '_', except for _raw
            if field.startswith("_") and field != "_raw":
                continue
            event[field] = scrubValue(event, value, field == "_raw", allterms, replacements,
                                      publicTerms, privateTerms, nameTerms, timeInfoTuplet)

def locate_anonymize_file(filename, app_dir, err_collection):
  """Find ``filename`` in the app's or the global anonymizer directory.

  filename       -- bare file name; anything containing '/', '\\' or '..'
                    is rejected.
  app_dir        -- directory of the app to look in first, or a false value
                    to check the global directory only.
  err_collection -- list that receives a message when the lookup fails.

  Returns the path of the file that was found, or None on failure.
  """
  # paths aren't accepted
  if any(marker in filename for marker in ("/", "\\", "..")):
    template = ("Pathnames are not accepted for any of the filename arguments.  " +
                "The file specifier '%s' is not permitted.")
    err_collection.append(template % filename)
    return None

  anonymize_dir = 'anonymizer'

  # the app's copy wins over the global one
  if app_dir:
    app_file_path = os.path.join(app_dir, anonymize_dir, filename)
    if os.path.isfile(app_file_path):
      return app_file_path

  global_file_path = make_splunkhome_path(['etc', anonymize_dir, filename])
  if os.path.isfile(global_file_path):
    return global_file_path

  # we couldn't find the file, so report where we looked
  searched_app = "app or " if app_dir else ""
  checked_app = "'%s' and " % app_file_path if app_dir else ""
  err_collection.append(
      "The filename '%s' could not be found in the " % filename
      + searched_app
      + "the global directory.  Checked "
      + checked_app
      + "'%s', but did not locate the file." % global_file_path)
  return None

# Script entry point: read the search results handed over by Splunk, locate
# the anonymizer configuration files, scrub the results and write them back.
# Everything, including the final output, lives under the __main__ guard so
# that importing this module has no side effects.
if __name__ == '__main__':
  # if this is nonempty later, we'll write it out as error
  err_results = []

  try:
    results,dummyresults,settings = splunk.Intersplunk.getOrganizedResults()

    # DEFAULT CONFIG FILE NAMES
    publictermsfilename     = "public-terms.txt"
    privatetermsfilename    = "private-terms.txt"
    nametermsfilename       = "names.txt"
    dictionaryfilename      = "dictionary.txt"
    timestampconfigfilename = 'anonymizer-time.ini'

    # GET ARGS
    keywords, argvals = splunk.Intersplunk.getKeywordsAndOptions() # argvals = splunk.dcutils.getArgValues()

    # ALLOW ARGS TO OVERRIDE DEFAULTS
    publictermsfilename = argvals.get("public-terms", publictermsfilename)
    privatetermsfilename = argvals.get("private-terms", privatetermsfilename)
    nametermsfilename = argvals.get("name-terms", nametermsfilename)
    dictionaryfilename = argvals.get("dictionary", dictionaryfilename)
    timestampconfigfilename = argvals.get("time-config", timestampconfigfilename)

    # locate the files
    app = argvals.get("namespace")

    # first find the app, if it exists
    app_dir = None
    if app:
      if "/" in app or "\\" in app or ".." in app:
        msg = "Error: namespace name may not include the '/' '\\' or '..' sequences"
        err_results.append(msg)
      else:
        app_dir = make_splunkhome_path(['etc', 'apps', app])
        if not os.path.isdir(app_dir):
          # fall back to the app directory name reported by the CLI helpers
          app_dir = make_splunkhome_path(['etc', comm.getAppDir(), app])
          if not os.path.isdir(app_dir):
            msg = "Error: could not find specified app '%s' on disk" % app
            err_results.append(msg)
            app_dir = None

    # now find each file in either the app or the global dir
    publicterms_path = locate_anonymize_file(publictermsfilename,
                                             app_dir, err_results)
    privateterms_path = locate_anonymize_file(privatetermsfilename,
                                              app_dir, err_results)
    nameterms_path = locate_anonymize_file(nametermsfilename,
                                           app_dir, err_results)
    dictionary_path = locate_anonymize_file(dictionaryfilename,
                                            app_dir, err_results)
    timestampconfig_path = locate_anonymize_file(timestampconfigfilename,
                                                 app_dir, err_results)

    # only scrub when every file was found and the namespace was valid
    if not err_results:
      scrub(results, publicterms_path, privateterms_path,
            nameterms_path, dictionary_path, timestampconfig_path)

  except:
    # deliberately broad: whatever goes wrong is handed back to the search
    # as an error result carrying the traceback
    import traceback
    stack =  traceback.format_exc()
    results = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
    err_results=[]

  # pass back explicitly determined errors
  if err_results:
    results = splunk.Intersplunk.generateErrorResults("\n".join(err_results))
  splunk.Intersplunk.outputResults( results )