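# Splunk custom search command (run through the Intersplunk protocol) that fits
# state-space forecasting models to numeric time-series fields and appends the
# predictions, together with upper/lower confidence bounds, to the search results.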
import sys
import os
import re
from math import sqrt
import csv
from time import mktime, localtime, strptime
from datetime import tzinfo, timedelta, datetime
import operator

import splunk.Intersplunk
import splunk.stats_util.statespace as statespace
from splunk.stats_util.dist import Erf
from builtins import range

# Used to convert confidence levels into z-score multipliers: z = inverf(p) * sqrt(2)
erf = Erf()
root2 = sqrt(2)

class FC:
    '''Forecast configuration and per-field state for one group of fields.'''

    def __init__(self, field=''):
        self.options = {'algorithm': 'LLP5', 'holdback': 0, 'correlate': None, 'upper': None, 'lower': None,
                        'suppress': None, 'period': -1, 'as': '', 'future_timespan': '5', 'ci': '95',
                        'last': None, 'start': 0, 'nonnegative': 'f'}
        self.setField(field)
        self.vals = None
        self.numvals = 0
        self.correlate = []
        # z-score multipliers for the upper and lower confidence bounds (default 95%)
        self.conf = [erf.inverf(.95)*root2]*2
        self.upper_conf = 95.
        self.lower_conf = 95.
        self.missingValued = False
        self.databegun = False
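    # Note on the default above: erf.inverf(0.95)*sqrt(2) is about 1.96, the familiar
    # two-sided 95% z-score, so each entry of self.conf multiplies the prediction's
    # standard deviation to produce a confidence bound.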
    def __str__(self):
        ordered_fields = sorted(self.fields.items(), key=operator.itemgetter(1), reverse=True)
        ret = str(ordered_fields) + ", options: {"
        for key in sorted(self.options):
            ret += " %s: %s," % (key, self.options[key])
        ret += "}"
        return ret
    def setField(self, field):
        if field != '':
            self.fields = {field: 'prediction(' + field + ')'}
            self.fieldValMap = {field: 0}
            self.options['as'] = self.fields[field]
            self.asNames = {field: field}
        else:
            self.fields = {}
            self.fieldValMap = {}
            self.asNames = {}
        self.iscount = {}
    def addField(self, field):
        self.setAsName(field, 'prediction(' + field + ')')
        self.fieldValMap[field] = len(self.fields) - 1
    def setAsName(self, field, name):
        self.options['as'] = name
        self.asNames[field] = name
        self.fields[field] = name
    def addVal(self, field, val):
        idx = self.fieldValMap[field]
        self.vals[idx].append(val)
    def setUpperLowerNames(self):
        self.upperNames = {}
        self.lowerNames = {}
        self.UIupperNames = {}
        self.UIlowerNames = {}
        self.UIpredictNames = {}
        for field in self.fields:
            if self.options['upper'] is not None:
                self.upperNames[field] = self.options['upper'] + '(' + self.fields[field] + ')'
            else:
                self.upperNames[field] = 'upper' + self.options['ci'] + '(' + self.fields[field] + ')'
            if self.options['lower'] is not None:
                self.lowerNames[field] = self.options['lower'] + '(' + self.fields[field] + ')'
            else:
                self.lowerNames[field] = 'lower' + self.options['ci'] + '(' + self.fields[field] + ')'
            self.UIupperNames[field] = '_upper' + field
            self.UIlowerNames[field] = '_lower' + field
            self.UIpredictNames[field] = '_predicted' + field
    def setModel(self):
        '''Validate the chosen algorithm and data length, then instantiate the state-space model.'''
        if self.options['algorithm'] not in statespace.ALGORITHMS:
            splunk.Intersplunk.generateErrorResults("Unknown algorithm: %s" % self.options['algorithm'])
            sys.exit()
        data_end = self.numvals - self.holdback
        if data_end < statespace.LL.least_num_data():
            splunk.Intersplunk.generateErrorResults("Too few data points: %d. Need at least %d (too many holdbacks (%d) maybe?)" % (data_end, statespace.LL.least_num_data(), self.holdback))
            sys.exit()

        self.data_end = data_end
        self.data_start = 0
        algorithm = self.options['algorithm']
        vals = self.vals
        future_timespan = self.future_timespan

        try:
            if algorithm[:3] == 'LLP':  # one of the LLP's, which take a seasonal period
                self.model = statespace.Univar(algorithm, vals, self.data_start, self.data_end, period=self.period, forecast_len=future_timespan, missingValued=self.missingValued)
            elif algorithm[:3] == 'LLB':  # one of the LLB's, which require a correlate series
                if len(self.correlate) == 0:
                    splunk.Intersplunk.parseError("No correlate values")
                    sys.exit()
                if data_end < statespace.LLB.least_num_data():
                    splunk.Intersplunk.generateErrorResults("Too few data points: %d. Need at least %d" % (data_end, statespace.LLB.least_num_data()))
                    sys.exit()
                self.model = statespace.Multivar(algorithm, vals, self.numvals, correlate=self.correlate, missingValued=self.missingValued)
            elif algorithm[:2] == 'Bi':  # one of the bivariate algorithms
                self.model = statespace.Multivar(algorithm, vals, data_end, forecast_len=future_timespan, missingValued=self.missingValued)
            else:
                self.model = statespace.Univar(algorithm, vals, self.data_start, self.data_end, forecast_len=future_timespan, missingValued=self.missingValued)
        except (AttributeError, ValueError) as e:
            splunk.Intersplunk.parseError(str(e))
            sys.exit()
    def predict(self):
        model = self.model
        if model.datalen() < model.least_num_data():
            splunk.Intersplunk.generateErrorResults("Too few data points: %d. Need at least %d" % (model.datalen(), model.least_num_data()))
            sys.exit()

        if self.options['algorithm'][:3] == 'LLB':
            start = max(self.data_end, 1)
            model.predict(0, start)
            self.future_timespan = 0
            self.lag = start + self.data_start
        else:
            self.lag = model.first_forecast_index() + self.data_start
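    # Note: for the LLB family the model estimates the target in-sample from the
    # correlate series rather than extrapolating beyond the data, which is why
    # future_timespan is forced to 0 above.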
    def setNonnegativity(self):
        '''If the user set the 'nonnegative' option to true, treat all fields as nonnegative and return.
        Otherwise, detect whether a field should be nonnegative by matching it against countpattern below.
        '''
        if self.options['nonnegative'].lower() == 't':
            for field in self.fields:
                self.iscount[field] = True
            return
        countpattern = re.compile(r'^(c|count|dc|distinct_count|estdc)($|\()')
        for field in self.fields:
            if countpattern.match(field.lower()) or countpattern.match(self.asNames[field].lower()):
                self.iscount[field] = True
            else:
                self.iscount[field] = False
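    # For example, fields named 'count', 'c(foo)', or 'dc(clientip)' match countpattern,
    # so their lower confidence bounds are clamped at zero in output().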
    def getSpans(self, results):
        '''
        My understanding of the span fields:
        1. If _spandays isn't set, then _span is correct and gives the bucket width in seconds
           (Python epoch-time units). In particular, minute and hour spans are converted to _span correctly.
        2. If _spandays is set, then _span isn't always correct because of daylight saving time. In that case
           one should ignore _span and use _spandays instead, converting _spandays to seconds via Python's
           struct_time, localtime() and mktime().
        3. There is no _spanmonths field, but our convention is: if _spandays >= 28, the month must be
           incremented and _spandays ignored thereafter. Hence, if _spandays >= 28, we define
           spanmonths = _spandays/28 and then ignore _spandays.
        '''
        spandays = spanmonths = None
        if '_span' in results[0]:
            span = int(results[0]['_span'])
            if '_spandays' in results[0]:
                spandays = int(results[0]['_spandays'])
                if spandays >= 28:
                    spanmonths = int(spandays/28)
        elif '_time' in results[0] and '_time' in results[1]:
            span = int(float(results[1]['_time']) - float(results[0]['_time']))
        else:
            splunk.Intersplunk.generateErrorResults("Unable to predict: data has no time")
            sys.exit()
        return (span, spandays, spanmonths)
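    # For example, hourly buckets arrive with _span=3600 and no _spandays, while monthly
    # buckets carry _spandays of 28-31, which the convention above maps to spanmonths=1.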
    def output(self, results):
        model = self.model
        beginning = self.beginning
        lag = self.lag
        datalen = model.datalen()
        data_start = self.data_start
        options = self.options

        ext = self.numvals - self.holdback + self.future_timespan
        if self.options['algorithm'][:3] == 'LLB':
            kk = min(len(results)-beginning, self.numvals)
        else:
            kk = min(len(results)-beginning, ext)

        # Since no numbers were present before 'beginning', we leave those positions empty in the results.
        # All predictions are pushed forward (in the results array) by the 'beginning' amount. Without this
        # forward push the predictions would be displayed at the wrong positions in the graphs: they would
        # appear before the actual data. See SPL-80502.
        for i in range(beginning + min(lag, datalen)):
            for field in self.fields:
                results[i][self.UIpredictNames[field]] = self.fields[field]
                results[i][self.fields[field]] = None
                results[i][self.UIupperNames[field]] = self.upperNames[field]
                results[i][self.upperNames[field]] = None
                results[i][self.UIlowerNames[field]] = self.lowerNames[field]
                results[i][self.lowerNames[field]] = None

        self.setNonnegativity()

        for i in range(lag, kk):
            j = i - data_start
            I = i + beginning
            for field in self.fields:
                if self.options['suppress'] == field:
                    continue
                field_idx = self.fieldValMap[field]
                state = model.state(field_idx, j)

                if model.var(field_idx, j) is None:
                    print("None at j = %s" % j)
                    print("state = %s" % state)
                    continue

                tmp = sqrt(abs(model.var(field_idx, j)))
                upper = state + self.conf[0]*tmp
                lower = state - self.conf[1]*tmp
                if self.iscount[field] and lower < 0: lower = 0.0
                results[I][self.UIpredictNames[field]] = self.fields[field]
                results[I][self.fields[field]] = str(state)
                results[I][self.UIupperNames[field]] = self.upperNames[field]
                results[I][self.upperNames[field]] = str(upper)
                results[I][self.UIlowerNames[field]] = self.lowerNames[field]
                results[I][self.lowerNames[field]] = str(lower)

        # SPL-181973 For datasets that have NULL data at the start of the time range, which can occur when
        # "earliest" or the time picker is used, lasttime needs to account for where the data actually begins.
        # For results with full data sets, the data always begins at 0.
        if '_time' in results[kk + beginning - 1]:
            lasttime = float(results[kk + beginning - 1]['_time'])
        else:
            splunk.Intersplunk.generateErrorResults("Unable to predict: data has no time")
            sys.exit()
        lasttime_struct = list(localtime(lasttime))  # convert to list since localtime() returns readonly objects
        (span, spandays, spanmonths) = self.getSpans(results)
        for i in range(kk, ext):  # if this range is non-empty, then ext > len(results); hence we append to results
            j = i - data_start
            (extendedtime, lasttime_struct) = self.computeExtendedTime(lasttime_struct, span, spandays, spanmonths)
            newdict = {'_time': str(extendedtime)}
            for field in self.fields:
                if self.options['suppress'] == field:
                    continue
                field_idx = self.fieldValMap[field]
                state = model.state(field_idx, j)
                tmp = sqrt(abs(model.var(field_idx, j)))
                upper = state + self.conf[0]*tmp
                lower = state - self.conf[1]*tmp
                if self.iscount[field] and lower < 0: lower = 0.0
                newdict[self.UIpredictNames[field]] = self.fields[field]
                newdict[self.fields[field]] = str(state)
                newdict[self.UIupperNames[field]] = self.upperNames[field]
                newdict[self.upperNames[field]] = str(upper)
                newdict[self.UIlowerNames[field]] = self.lowerNames[field]
                newdict[self.lowerNames[field]] = str(lower)
            results.append(newdict)
    def computeExtendedTime(self, lasttime_struct, span, spandays, spanmonths):
        hour = lasttime_struct[3]
        if spanmonths:
            lasttime_struct[1] += spanmonths  # increment the tm_mon field in Python's struct_time
        elif spandays:
            lasttime_struct[2] += spandays  # increment the tm_mday field in Python's struct_time
        else:
            lasttime_struct[5] += span  # increment the tm_sec field

        extendtime = mktime(tuple(lasttime_struct))  # convert back to seconds
        lasttime_struct = list(localtime(extendtime))

        # Dealing with daylight saving time. If the previous timestamp shows 12AM, we want the next
        # timestamp to still be 12AM (not 1AM or 11PM) when users set span=1d or span=1mon, even when
        # DST is in effect.
        if spandays is not None:
            if lasttime_struct[8] == 1 and (lasttime_struct[3] > hour or (hour == 23 and lasttime_struct[3] == 0)):
                extendtime -= 3600
                lasttime_struct = list(localtime(extendtime))
            elif lasttime_struct[8] == 0 and (lasttime_struct[3] < hour or (hour == 0 and lasttime_struct[3] == 23)):
                extendtime += 3600
                lasttime_struct = list(localtime(extendtime))
        return (extendtime, lasttime_struct)
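    # For example, stepping from 2024-03-10 00:00 to the next day with spandays=1 in a US
    # timezone: mktime() normalizes the incremented struct_time to 01:00 because DST has just
    # begun, and the isdst/hour check above subtracts 3600 seconds to restore midnight.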
    def checkFutureTimespan(self):
        try:
            self.future_timespan = int(self.options['future_timespan'])
            if self.future_timespan < 0:
                raise ValueError
        except ValueError:
            splunk.Intersplunk.parseError("Invalid future_timespan: '%s'" % self.options['future_timespan'])
    def checkPeriod(self):
        self.period = self.options['period']
        if self.period != -1:
            try:
                self.period = int(self.period)
                if self.period < 1:
                    raise ValueError
            except ValueError:
                splunk.Intersplunk.parseError("Invalid period: '%s'" % self.options['period'])
    def checkHoldback(self):
        self.holdback = self.options['holdback']
        if self.holdback:
            try:
                self.holdback = int(self.options['holdback'])
                if self.holdback < 0:
                    raise ValueError
            except ValueError:
                splunk.Intersplunk.parseError("Invalid holdback: '%s'" % self.options['holdback'])
    def checkDataStart(self):
        try:
            self.data_start = int(self.options['start'])
            if self.data_start < 0:
                raise ValueError
        except ValueError:
            splunk.Intersplunk.parseError("Invalid start: '%s'" % self.options['start'])
    def checkNonnegative(self):
        # bool() on a non-empty string never raises ValueError, so validate the flag
        # explicitly against the 't'/'f' values used by setNonnegativity().
        val = str(self.options['nonnegative']).lower()
        if val not in ('t', 'f'):
            splunk.Intersplunk.parseError("Invalid nonnegative value: '%s'" % self.options['nonnegative'])
        self.nonnegative = (val == 't')
    def initVals(self):
        self.vals = [[] for _ in range(len(self.fields))]
    def lastCheck(self):
        self.setUpperLowerNames()  # in case they weren't set
        self.checkFutureTimespan()
        self.checkPeriod()
        self.checkHoldback()
        self.checkDataStart()
        self.checkNonnegative()
        self.initVals()

def parseOps(argv):
    '''Parse the command's arguments into a list of FC forecast configurations.'''
    argc = len(argv)
    if argc == 0: raise ValueError("No field specified")

    fcs = [FC()]

    i = 0
    fc = fcs[-1]
    current_field = None
    while i < argc:
        arg = argv[i].lower()

        if arg == 'as':
            if i+1 == argc or argv[i+1].find('=') != -1:
                raise ValueError("missing new name after 'as'")
            fc.setAsName(current_field, argv[i+1])
            i += 2
            continue

        pos = arg.find("=")
        if pos != -1:
            attr = arg[:pos]
            if attr in fc.options:
                if attr == 'as':
                    fc.setAsName(current_field, argv[i][pos+1:])
                else:
                    fc.options[attr] = argv[i][pos+1:]
            elif attr[:5] == "upper":
                try:
                    fc.upper_conf = float(attr[5:])
                    if fc.upper_conf < 0 or fc.upper_conf >= 100: raise ValueError
                    fc.conf[0] = erf.inverf(fc.upper_conf/100.)*root2
                except ValueError:
                    raise ValueError("bad upper confidence interval")
                fc.options['upper'] = argv[i][pos+1:]
            elif attr[:5] == "lower":
                try:
                    fc.lower_conf = float(attr[5:])
                    if fc.lower_conf < 0 or fc.lower_conf >= 100: raise ValueError
                    fc.conf[1] = erf.inverf(fc.lower_conf/100.)*root2
                except ValueError:
                    raise ValueError("bad lower confidence interval")
                fc.options['lower'] = argv[i][pos+1:]
            else:
                raise ValueError("unknown option %s" % arg)
            i += 1
            continue

        if len(fc.fields) == 0:
            isField = True
            while isField:
                fc.addField(argv[i])
                current_field = argv[i]
                i += 1
                if i < argc:
                    arg = argv[i].lower()
                    if arg == 'as':
                        if i+1 == argc or argv[i+1].find('=') != -1:
                            raise ValueError("missing new name after 'as'")
                        fc.setAsName(current_field, argv[i+1])
                        i += 2
                        if i >= argc: break
                        arg = argv[i].lower()
                    if arg.find('=') != -1:
                        isField = False
                else: break
        else:
            fc.lastCheck()  # in case they weren't set
            fcs.append(FC(argv[i]))  # start a new FC
            current_field = argv[i]
            fc = fcs[-1]
            i += 1

    fc.lastCheck()  # in case they weren't set
    return fcs

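# Hypothetical example: an SPL fragment such as
#   ... | predict count AS pc algorithm=LLP5 period=24 future_timespan=10 upper90=high
# reaches parseOps as ['count', 'AS', 'pc', 'algorithm=LLP5', 'period=24',
# 'future_timespan=10', 'upper90=high'] and yields one FC whose conf[0] is the 90% z-score.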
def readSearchResults(results, fcs):
    if len(results) == 0:
        splunk.Intersplunk.generateErrorResults("No data")
        sys.exit(0)
    for fc in fcs:
        for field in fc.fields:
            if field not in results[0]:
                splunk.Intersplunk.generateErrorResults("Unknown field: %s" % field)
                sys.exit(0)
        fc.beginning = 0
    for res in results:
        for fc in fcs:
            for field in fc.fields:
                if field in res:
                    try:
                        fc.addVal(field, float(res[field]))
                        fc.databegun = True
                    except ValueError:
                        if not fc.databegun:
                            fc.beginning += 1  # increase 'beginning' only while no numbers have been encountered
                        elif res[field] is None or res[field] == '':
                            fc.addVal(field, None)
                            fc.missingValued = True
            if fc.options['correlate'] in res:
                if res[fc.options['correlate']] is None or res[fc.options['correlate']] == '':
                    fc.correlate.append(None)
                    fc.missingValued = True
                else:
                    try:
                        fc.correlate.append(float(res[fc.options['correlate']]))
                    except ValueError:
                        splunk.Intersplunk.parseError("bad correlate field value: " + res[fc.options['correlate']])
    for fc in fcs:
        fc.numvals = len(fc.vals[0])

def predictAll(fcs, results):
    readSearchResults(results, fcs)
    for fc in fcs:
        fc.setModel()
        fc.predict()
        fc.output(results)

if __name__ == "__main__":
    (isgetinfo, sys.argv) = splunk.Intersplunk.isGetInfo(sys.argv)
    if isgetinfo:
        splunk.Intersplunk.outputInfo(False, False, True, False, None, True)
        # outputInfo automatically calls sys.exit()
    try:
        fcs = parseOps(sys.argv[1:])
    except ValueError as e:
        splunk.Intersplunk.parseError(str(e))
    results = splunk.Intersplunk.readResults(None, None, False)
    predictAll(fcs, results)
    splunk.Intersplunk.outputResults(results)