# [scrape residue: "You can not select more than 25 topics ...", "476 lines",
#  "20 KiB" — code-host UI chrome, not part of this module; commented out so
#  the file parses.]
import sys
import os
import re
from math import sqrt
import csv
from time import mktime, localtime, strptime
from datetime import tzinfo, timedelta, datetime
import operator
import splunk.Intersplunk
import splunk.stats_util.statespace as statespace
from splunk.stats_util.dist import Erf
from builtins import range
erf = Erf()
root2 = sqrt(2)
class FC:
    """Per-clause forecasting state for the `predict` command: the fields to
    forecast, their parsed options, the collected data values, and the
    confidence-interval multipliers derived from the `ci` option."""

    def __init__(self, field=''):
        # Defaults for every recognized option; parseOps() overwrites these
        # from the search-language arguments.
        self.options = {
            'algorithm': 'LLP5',
            'holdback': 0,
            'correlate': None,
            'upper': None,
            'lower': None,
            'suppress': None,
            'period': -1,
            'as': '',
            'future_timespan': '5',
            'ci': '95',
            'last': None,
            'start': 0,
            'nonnegative': 'f',
        }
        self.setField(field)
        self.vals = None        # per-field lists of observed values (allocated by initVals)
        self.numvals = 0
        self.correlate = []     # values of the 'correlate' field (LLB algorithms)
        # Multipliers for the default 95% interval; index 0 = upper, 1 = lower.
        self.conf = [erf.inverf(.95) * root2] * 2
        self.upper_conf = 95.
        self.lower_conf = 95.
        self.missingValued = False  # becomes True when a None/'' value is recorded
        self.databegun = False      # becomes True once the first numeric value is seen
def __str__(self):
ordered_fields = sorted(self.fields.items(), key=operator.itemgetter(1), reverse=True)
ret = str(ordered_fields) + ", options: {"
for key in sorted(self.options):
ret += " %s: %s," %(key, self.options[key])
ret += "}"
return ret
def setField(self, field):
if field != '':
self.fields = {field: 'prediction(' + field + ')'}
self.fieldValMap = {field:0}
self.options['as'] = self.fields[field]
self.asNames = {field: field}
else:
self.fields = {}
self.fieldValMap = {}
self.asNames = {}
self.iscount = {}
def addField(self, field):
self.setAsName(field, 'prediction(' + field + ')' )
self.fieldValMap[field] = len(self.fields) - 1
def setAsName(self, field, name):
self.options['as'] = name
self.asNames[field] = name
self.fields[field] = name
def addVal(self, field, val):
idx = self.fieldValMap[field]
self.vals[idx].append(val)
def setUpperLowerNames(self):
self.upperNames = {}
self.lowerNames = {}
self.UIupperNames = {}
self.UIlowerNames = {}
self.UIpredictNames = {}
for field in self.fields:
if self.options['upper'] != None:
self.upperNames[field] = self.options['upper'] + '(' + self.fields[field] + ')'
else:
self.upperNames[field] = 'upper' + self.options['ci'] + '(' + self.fields[field] + ')'
if self.options['lower'] != None:
self.lowerNames[field] = self.options['lower'] + '(' + self.fields[field] + ')'
else:
self.lowerNames[field] = 'lower' + self.options['ci'] + '(' + self.fields[field] + ')'
self.UIupperNames[field] = '_upper' + field
self.UIlowerNames[field] = '_lower' + field
self.UIpredictNames[field] = '_predicted' + field
    def setModel(self):
        """Validate the algorithm and data volume, then build the statespace
        model for this clause.

        Model family is chosen by algorithm-name prefix: 'LLP*' (periodic
        univariate), 'LLB*' (bivariate against the 'correlate' series),
        'Bi*' (other multivariate), otherwise plain univariate.  Any
        validation failure aborts the search with an error message.
        """
        if self.options['algorithm'] not in statespace.ALGORITHMS:
            splunk.Intersplunk.generateErrorResults("Unknown algorithm: %s" %self.options['algorithm'])
            sys.exit()
        # Withhold the trailing 'holdback' points from fitting.
        data_end = self.numvals - self.holdback
        if data_end < statespace.LL.least_num_data():
            splunk.Intersplunk.generateErrorResults("Too few data points: %d. Need at least %d (too many holdbacks (%d) maybe?)" %(data_end, statespace.LL.least_num_data(), self.holdback))
            sys.exit()
        self.data_end = data_end
        # NOTE(review): this clobbers the value checkDataStart() parsed from
        # the 'start' option — confirm whether 'start' is meant to reach the model.
        self.data_start = 0
        algorithm = self.options['algorithm']
        vals = self.vals
        future_timespan = self.future_timespan
        try:
            if algorithm[:3] == 'LLP':
                # Periodic model; period=-1 is the parsed default (see checkPeriod).
                self.model = statespace.Univar(algorithm, vals, self.data_start, self.data_end, period=self.period, forecast_len=future_timespan, missingValued=self.missingValued)
            elif algorithm[:3] == 'LLB': # one of the LLB's
                if len(self.correlate) == 0:
                    splunk.Intersplunk.parseError("No correlate values")
                    sys.exit()
                if data_end < statespace.LLB.least_num_data():
                    splunk.Intersplunk.generateErrorResults("Too few data points: %d. Need at least %d" %(data_end, statespace.LLB.least_num_data()))
                    sys.exit()
                # LLB fits over all numvals with no forecast_len; predict()
                # later runs it over the observed range instead of extending it.
                self.model = statespace.Multivar(algorithm, vals, self.numvals, correlate=self.correlate, missingValued=self.missingValued)
            elif algorithm[:2] == 'Bi': # one of the bivariate algorithms
                self.model = statespace.Multivar(algorithm, vals, data_end, forecast_len=future_timespan, missingValued=self.missingValued)
            else:
                self.model = statespace.Univar(algorithm, vals, self.data_start, self.data_end, forecast_len=future_timespan, missingValued=self.missingValued)
        except (AttributeError, ValueError) as e:
            # Model constructors signal bad parameters via these exceptions.
            splunk.Intersplunk.parseError(str(e))
            sys.exit()
def predict(self):
model = self.model
if model.datalen() < model.least_num_data():
splunk.Intersplunk.generateErrorResults("Too few data points: %d. Need at least %d" %(model.datalen(), model.least_num_data()))
sys.exit()
if self.options['algorithm'][:3] == 'LLB':
start = max(self.data_end, 1)
model.predict(0, start)
self.future_timespan = 0
self.lag = start + self.data_start
else:
self.lag = model.first_forecast_index() + self.data_start
def setNonnegativity(self):
''' If user set the 'nonnegative' option to true, then treat the fields as nonnegative and return.
If not, then detect whether a field should be nonnegative by matching it with the countpattern below.
'''
if self.options['nonnegative'].lower() == 't':
for field in self.fields:
self.iscount[field] = True
return
countpattern = re.compile(r'^(c|count|dc|distinct_count|estdc)($|\()')
for field in self.fields:
if countpattern.match(field.lower()) or countpattern.match(self.asNames[field].lower()):
self.iscount[field] = True
else:
self.iscount[field] = False
def getSpans(self, results):
'''
My understanding of the span fields is that:
1. If _spandays isn't set, then _span is correct and counts the number of seconds since the epoch as defined in python.
In particular, minute and hour spans are converted to _span correctly.
2. If _spandays is set, then _span isn't always correct because of daylight time saving. So one should ignore _span in this case
and use _spandays instead. We need to convert _spandays to seconds by using python's struct_time, localtime() and mktime().
3. There is no _spanmonths field, but our convention is: if _spandays >= 28, then the month must be incremented and after that _spandays
should be ignored. Hence, if _spandays >= 28, then we define spanmonths = _spandays/28 and then ignore _spandays.
'''
spandays = spanmonths = None
if '_span' in results[0].keys():
span = int(results[0]['_span'])
if '_spandays' in results[0].keys():
spandays = int(results[0]['_spandays'])
if spandays >= 28:
spanmonths = int(spandays/28)
elif '_time' in results[0].keys() and '_time' in results[1].keys():
span = int(float(results[1]['_time']) - float(results[0]['_time']))
else:
splunk.Intersplunk.generateErrorResults("Unable to predict: data has no time")
sys.exit()
return (span, spandays, spanmonths)
    def output(self, results):
        """Write predictions, confidence bounds and UI helper columns into
        *results*, appending new timestamped rows for the forecast horizon.

        Per field it writes: the prediction under the field's alias, the
        upper/lower bound columns (named by setUpperLowerNames), and hidden
        '_predicted…'/'_upper…'/'_lower…' columns that carry the visible
        column names for the UI.
        """
        model = self.model
        beginning = self.beginning  # leading rows with no numeric data (see readSearchResults)
        lag = self.lag
        datalen = model.datalen()
        data_start = self.data_start
        options = self.options  # NOTE(review): unused local, kept as-is
        # Total positions to fill: observed (minus holdback) plus forecast horizon.
        ext = self.numvals - self.holdback + self.future_timespan
        if self.options['algorithm'][:3] == 'LLB':
            kk = min(len(results)-beginning, self.numvals)
        else:
            kk = min(len(results)-beginning, ext)
        # Since no numbers were present before 'beginning', we should leave those positions empty in the results.
        # All predictions are pushed forward (in the results array) by the 'beginning' amount. Without this forward push the
        # predictions would be displayed at the wrong positions in the graphs: the predictions would appear
        # before the actual data. See SPL-80502.
        for i in range(beginning + min(lag, datalen)):
            for field in self.fields:
                results[i][self.UIpredictNames[field]] = self.fields[field]
                results[i][self.fields[field]] = None
                results[i][self.UIupperNames[field]] = self.upperNames[field]
                results[i][self.upperNames[field]] = None
                results[i][self.UIlowerNames[field]] = self.lowerNames[field]
                results[i][self.lowerNames[field]] = None
        self.setNonnegativity()
        # Fill predictions into the rows that already exist.
        for i in range(lag,kk):
            j = i - data_start
            I = i + beginning
            for field in self.fields:
                if self.options['suppress'] == field:
                    continue
                field_idx = self.fieldValMap[field]
                state = model.state(field_idx,j)
                if model.var(field_idx,j) == None:
                    # NOTE(review): debug prints left in place; they end up in
                    # the command's output stream.
                    print("None at j = %s" % j)
                    print("state = %s" % state)
                    continue
                # Interval half-widths: conf[0] is the upper multiplier,
                # conf[1] the lower one (they differ when upperNN/lowerNN were given).
                tmp = sqrt(abs(model.var(field_idx,j)))
                upper = state + self.conf[0]*tmp
                lower = state - self.conf[1]*tmp
                if self.iscount[field] and lower < 0: lower = 0.0
                results[I][self.UIpredictNames[field]] = self.fields[field]
                results[I][self.fields[field]] = str(state)
                results[I][self.UIupperNames[field]] = self.upperNames[field]
                results[I][self.upperNames[field]] = str(upper)
                results[I][self.UIlowerNames[field]] = self.lowerNames[field]
                results[I][self.lowerNames[field]] = str(lower)
        # SPL-181973 For datasets that have NULL data at the start of the time range, which can occur when "earliest" or the
        # time picker is used, the lasttime will need to account for where the data actually begins.
        # For results with full data sets, the result will always begin at 0
        if '_time' in results[kk + beginning - 1]:
            lasttime = float(results[kk + beginning - 1]['_time'])
        else:
            splunk.Intersplunk.generateErrorResults("Unable to predict: data has no time")
            sys.exit()
        lasttime_struct = list(localtime(lasttime)) # convert to list since localtime() returns readonly objects
        (span, spandays, spanmonths) = self.getSpans(results)
        for i in range(kk,ext): # if this range is non-empty, that means ext > len(results); hence we should append to results
            j = i - data_start
            # Step the timestamp forward one span per appended row.
            (extendedtime, lasttime_struct) = self.computeExtendedTime(lasttime_struct, span, spandays, spanmonths)
            newdict = {'_time': str(extendedtime)}
            for field in self.fields:
                if self.options['suppress'] == field:
                    continue
                field_idx = self.fieldValMap[field]
                state = model.state(field_idx, j)
                tmp = sqrt(abs(model.var(field_idx,j)))
                upper = state + self.conf[0]*tmp
                lower = state - self.conf[1]*tmp
                if self.iscount[field] and lower < 0: lower = 0.0
                newdict[self.UIpredictNames[field]] = self.fields[field]
                newdict[self.fields[field]] = str(state)
                newdict[self.UIupperNames[field]] = self.upperNames[field]
                newdict[self.upperNames[field]] = str(upper)
                newdict[self.UIlowerNames[field]] = self.lowerNames[field]
                newdict[self.lowerNames[field]] = str(lower)
            results.append(newdict)
    def computeExtendedTime(self, lasttime_struct, span, spandays, spanmonths):
        """Advance *lasttime_struct* by one span and return
        (epoch_seconds, new_struct_time_as_list).

        Month/day spans are applied by bumping the struct_time fields and
        letting mktime() normalize; second spans are added directly.  For
        day/month spans the DST correction below keeps the wall-clock hour
        stable across DST transitions.
        """
        hour = lasttime_struct[3]  # tm_hour before advancing, for the DST comparison below
        if spanmonths:
            lasttime_struct[1] += spanmonths # increment the tm_mon field in python's struct_time
        elif spandays:
            lasttime_struct[2] += spandays # increment the tm_mday field in python's struct_time
        else:
            lasttime_struct[5] += span
        extendtime = mktime(tuple(lasttime_struct)) # convert back to seconds
        lasttime_struct = list(localtime(extendtime))
        # Dealing with daylight saving time. If the previous timestamp shows 12AM, we want
        # the next timestamp to still be 12AM (not 1AM or 23PM) when users set span=1d or span=1mon
        # even when DST is in effect.
        if spandays != None:
            # index 8 is tm_isdst; index 3 is tm_hour
            if lasttime_struct[8]==1 and (lasttime_struct[3] > hour or (hour==23 and lasttime_struct[3]==0)):
                extendtime -= 3600
                lasttime_struct = list(localtime(extendtime))
            elif lasttime_struct[8]==0 and (lasttime_struct[3] < hour or (hour==0 and lasttime_struct[3]==23)):
                extendtime += 3600
                lasttime_struct = list(localtime(extendtime))
        return (extendtime, lasttime_struct)
def checkFutureTimespan(self):
try:
self.future_timespan = int(self.options['future_timespan'])
if self.future_timespan < 0:
raise ValueError
except ValueError:
splunk.Intersplunk.parseError("Invalid future_timespan: '%s'" %self.options['future_timespan'])
def checkPeriod(self):
self.period = self.options['period']
if self.period != -1:
try:
self.period = int(self.period)
if self.period < 1:
raise ValueError
except ValueError:
splunk.Intersplunk.parseError("Invalid period : '%s'" %self.options['period'])
def checkHoldback(self):
self.holdback = self.options['holdback']
if self.holdback:
try:
self.holdback = int(self.options['holdback'])
if self.holdback < 0:
raise ValueError
except ValueError:
splunk.Intersplunk.parseError("Invalid holdback: '%s'" %self.options['holdback'])
def checkDataStart(self):
try:
self.data_start = int(self.options['start'])
if self.data_start < 0:
raise ValueError
except ValueError:
splunk.Intersplunk.parseError("Invalid start: '%s'" %self.options['start'])
def checkNonnegative(self):
try:
self.nonnegative = bool(self.options['nonnegative'])
except ValueError:
splunk.Intersplunk.parseError("Invalid nonnegative value: '%s'" %self.options['nonnegative'])
def initVals(self):
self.vals = [None]*len(self.fields)
for i in range(len(self.vals)):
self.vals[i] = []
    def lastCheck(self):
        """Finalize this clause once its arguments are parsed: resolve the
        output column names, validate/convert every numeric option, and
        allocate the per-field value lists."""
        self.setUpperLowerNames() # if they weren't
        self.checkFutureTimespan()
        self.checkPeriod()
        self.checkHoldback()
        self.checkDataStart()
        self.checkNonnegative()
        self.initVals()
def parseOps(argv):
    """Parse the predict-command arguments into a list of FC clauses.

    Informal grammar: one or more field names, each optionally followed by
    "as <alias>", interleaved with option assignments "name=value".  Options
    named "upperNN"/"lowerNN" carry the confidence level NN in the option
    name itself.  A bare field name after options starts a new FC clause.
    Raises ValueError on malformed input.
    """
    argc = len(argv)
    if argc == 0: raise ValueError("No field specified")
    fcs = [FC()]
    i = 0
    fc = fcs[-1]
    current_field = None
    while i < argc:
        arg = str.lower(argv[i])
        if arg == 'as':
            # "as" must be followed by a plain name, not another option.
            if i+1 == argc or argv[i+1].find('=') != -1:
                raise ValueError("missing new name after 'as'")
            fc.setAsName(current_field,argv[i+1])
            i += 2
            continue
        pos = arg.find("=")
        if pos != -1:
            # Option assignment; only the option name is lowercased — the
            # value is taken from the original, case-preserved argv[i].
            attr = arg[:pos]
            if attr in fc.options.keys():
                if attr=='as':
                    fc.setAsName(current_field, argv[i][pos+1:])
                else:
                    fc.options[attr] = argv[i][pos+1:]
            elif attr[:5]=="upper":
                # e.g. "upper97=high": confidence level embedded in the name.
                try:
                    fc.upper_conf = float(attr[5:])
                    if fc.upper_conf < 0 or fc.upper_conf >= 100: raise ValueError
                    fc.conf[0] = erf.inverf(fc.upper_conf/100.)*root2
                except ValueError:
                    raise ValueError("bad upper confidence interval")
                fc.options['upper'] = argv[i][pos+1:]
            elif attr[:5]=="lower":
                try:
                    fc.lower_conf = float(attr[5:])
                    if fc.lower_conf < 0 or fc.lower_conf >= 100: raise ValueError
                    fc.conf[1] = erf.inverf(fc.lower_conf/100.)*root2
                except ValueError:
                    raise ValueError("bad lower confidence interval")
                fc.options['lower'] = argv[i][pos+1:]
            else:
                raise ValueError("unknown option %s" %arg)
            i +=1
            continue
        if len(fc.fields) == 0:
            # First field(s) of the current clause: consume consecutive field
            # names (each optionally followed by "as <alias>") until an option
            # assignment or the end of the argument list.
            isField = True
            while isField:
                fc.addField(argv[i])
                current_field = argv[i]
                i += 1
                if i < argc:
                    arg = str.lower(argv[i])
                    if arg == 'as':
                        if i+1==argc or argv[i+1].find('=') != -1:
                            raise ValueError("missing new name after 'as'")
                        fc.setAsName(current_field,argv[i+1])
                        i += 2
                        if i >= argc: break
                        arg = str.lower(argv[i])
                    if arg.find('=') != -1:
                        isField = False
                else: break
        else:
            # A bare field name after this clause already has fields: close
            # out the current clause and start a new one.
            fc.lastCheck() # if they weren't set
            fcs.append(FC(argv[i])) # start new FC
            current_field = argv[i]
            fc = fcs[-1]
            i += 1
    fc.lastCheck() # if they weren't set
    return fcs
def readSearchResults(results, fcs):
    """Load field values (and correlate-series values) from *results* into
    each FC clause.

    Per clause it also counts 'beginning': the number of leading rows with
    no numeric value, so output() can shift predictions past them
    (SPL-80502 / SPL-181973).  Empty values seen after data has begun are
    recorded as None and mark the clause as missing-valued.
    """
    if len(results) == 0:
        splunk.Intersplunk.generateErrorResults("No data")
        sys.exit(0)
    for fc in fcs:
        for field in fc.fields:
            # NOTE(review): only the first row is checked for the field's
            # presence; a field appearing only in later rows is rejected
            # here — confirm this is intended.
            if field not in results[0]:
                splunk.Intersplunk.generateErrorResults("Unknown field: %s" %field)
                sys.exit(0)
        fc.beginning = 0
    for res in results:
        for fc in fcs:
            for field in fc.fields:
                if field in res:
                    try:
                        fc.addVal(field, float(res[field]))
                        fc.databegun = True
                    except ValueError:
                        # float('') and other non-numeric strings land here.
                        if not fc.databegun:
                            fc.beginning += 1 # increase 'beginning' only when no numbers have been encountered
                        elif res[field]==None or res[field]=='':
                            # Missing value inside the data: keep the slot with None.
                            fc.addVal(field, None)
                            fc.missingValued = True
            if fc.options['correlate'] in res:
                if res[fc.options['correlate']]==None or res[fc.options['correlate']]=='':
                    fc.correlate.append(None)
                    fc.missingValued = True
                else:
                    try:
                        fc.correlate.append(float(res[fc.options['correlate']]))
                    except ValueError:
                        splunk.Intersplunk.parseError("bad correlate field value: " + res[fc.options['correlate']])
    for fc in fcs:
        # All of a clause's value lists grow in lockstep; the first one's
        # length is the data count.
        fc.numvals = len(fc.vals[0])
def predictAll(fcs, results):
    """Drive the full pipeline: ingest *results* into every clause, then
    fit, forecast, and write the output columns back into *results*."""
    readSearchResults(results, fcs)
    for clause in fcs:
        clause.setModel()
        clause.predict()
        clause.output(results)
if __name__ == "__main__":
    # Splunk invokes this script as a search command; isGetInfo reports
    # whether this is the capability-negotiation phase and returns argv with
    # that marker stripped.
    (isgetinfo, sys.argv) = splunk.Intersplunk.isGetInfo(sys.argv)
    if isgetinfo:
        splunk.Intersplunk.outputInfo(False, False, True, False, None, True)
        # outputInfo automatically calls sys.exit()
    try:
        forecaster = parseOps(sys.argv[1:])
    except ValueError as e:
        # parseError reports the message back to the search and exits.
        splunk.Intersplunk.parseError(str(e))
    results = splunk.Intersplunk.readResults(None, None, False)
    predictAll(forecaster, results)
    splunk.Intersplunk.outputResults(results)

# [scrape residue: "Powered by BW's shoe-string budget." — page footer, not code]