|
|
# Version 9.2.2.20240415
|
|
|
# DO NOT EDIT THIS FILE!
|
|
|
# Changes to default files will be lost on update and are difficult to
|
|
|
# manage and support.
|
|
|
#
|
|
|
# Please make any changes to system defaults by overriding them in
|
|
|
# apps or $SPLUNK_HOME/etc/system/local
|
|
|
# (See "Configuration file precedence" in the web documentation).
|
|
|
#
|
|
|
# To override a specific setting, copy the name of the stanza and
|
|
|
# setting to the file where you wish to override it.
|
|
|
#
|
|
|
# This file configures the splunkd health report.
|
|
|
#
|
|
|
[distributed_health_reporter]
|
|
|
disabled = 0
|
|
|
|
|
|
[health_reporter]
|
|
|
full_health_log_interval = 30
|
|
|
suppress_status_update_ms = 300
|
|
|
latency_tracker_log_interval = 30
|
|
|
aggregate_ingestion_latency_health = 1
|
|
|
ingestion_latency_send_interval = 30
|
|
|
ingestion_latency_send_interval_max = 86400
|
|
|
alert.disabled = 0
|
|
|
alert.actions = email
|
|
|
alert.min_duration_sec = 60
|
|
|
alert.threshold_color = red
|
|
|
alert.suppress_period = 10m
|
|
|
|
|
|
[alert_action:email]
|
|
|
disabled = 0
|
|
|
action.to =
|
|
|
action.cc =
|
|
|
action.bcc =
|
|
|
|
|
|
[alert_action:webhook]
|
|
|
disabled = 0
|
|
|
action.url =
|
|
|
|
|
|
[alert_action:pagerduty]
|
|
|
disabled = 0
|
|
|
action.integration_url_override =
|
|
|
|
|
|
[alert_action:mobile]
|
|
|
disabled = 0
|
|
|
action.alert_recipients =
|
|
|
|
|
|
[alert_action:victorops]
|
|
|
disabled = 0
|
|
|
action.message_type = CRITICAL
|
|
|
action.entity_id =
|
|
|
action.record_id =
|
|
|
action.routing_key_override =
|
|
|
|
|
|
[tree_view:health_subset]
|
|
|
|
|
|
[clustering]
|
|
|
disabled = 0
|
|
|
health_report_period = 20
|
|
|
|
|
|
|
|
|
# Health Report Features <In features tree order>
|
|
|
|
|
|
# File Monitor Input
|
|
|
[feature:batchreader]
|
|
|
display_name = Large and Archive File Reader
|
|
|
friendly_description = Shows if the Batch File Reader is able to add data from log files for processing.
|
|
|
indicator:data_out_rate:friendly_description = Batch File Reader is able to insert data into processing queues within the configured thresholds.
|
|
|
indicator:data_out_rate:description = This indicator reflects the number of consecutive times the Batch File Reader was unable to insert data into Splunk's processing queues for a period of 5 seconds. Each failed insertion attempt blocks the input processor for 5 seconds. By default, this indicator becomes Yellow when the insertion attempt fails once, Red after 2 consecutive failures.
|
|
|
indicator:data_out_rate:yellow = 1
|
|
|
indicator:data_out_rate:red = 2
|
|
|
alert.disabled = 1
|
|
|
|
|
|
[feature:tailreader]
|
|
|
display_name = Real-time Reader
|
|
|
friendly_description = Shows if the Tail File Reader isn't able to add data from log files for processing.
|
|
|
indicator:data_out_rate:friendly_description = Tail File Reader is able to insert data into processing queues within the configured intervals.
|
|
|
indicator:data_out_rate:description = This indicator reflects the number of consecutive times the Tail File Reader was unable to insert data into Splunk's processing queues for a period of 5 seconds. By default, this indicator becomes Yellow when this input stalls for 5 seconds, and Red after 10 seconds.
|
|
|
indicator:data_out_rate:yellow = 1
|
|
|
indicator:data_out_rate:red = 2
|
|
|
alert.disabled = 1
|
|
|
|
|
|
# Data Forwarding
|
|
|
[feature:s2s_autolb]
|
|
|
display_name = Auto Load Balanced TCP Output
|
|
|
friendly_description = Shows when this forwarder can’t connect to enough configured receivers.
|
|
|
indicator:s2s_connections:friendly_description = This forwarder is able to connect to enough receivers.
|
|
|
indicator:s2s_connections:description = This indicator gauges whether this forwarder can successfully connect to all indexers configured in outputs.conf. By default, this indicator becomes Yellow when 20% of indexers are unreachable, and Red at 70%.
|
|
|
indicator:s2s_connections:yellow = 20
|
|
|
indicator:s2s_connections:red = 70
|
|
|
|
|
|
# Indexer Clustering
|
|
|
[feature:cluster_bundles]
|
|
|
display_name = Cluster Bundles
|
|
|
friendly_description = Notifies when the index cluster configurations aren’t in sync due to a bundle validation failure.
|
|
|
indicator:cluster_bundles:friendly_description = Bundles are in sync. No validation errors in the last bundle push to cluster peers.
|
|
|
indicator:cluster_bundles:description = This indicator reflects whether there were validation errors in the last bundle that was pushed to cluster peers. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:cluster_bundles:yellow = 1
|
|
|
indicator:count_full_bundle_untar_last_10mins:friendly_description = All full bundle replications were successfully decompressed.
|
|
|
indicator:count_full_bundle_untar_last_10mins:description = This indicator counts the number of failures when decompressing the replicated full bundles in the last 10 minutes. By default, Yellow occurs when there are 2 failures, and red when there are 5 failures, within the last 10 minutes. Setting both thresholds to 0 will disable this indicator.
|
|
|
indicator:count_full_bundle_untar_last_10mins:yellow = 2
|
|
|
indicator:count_full_bundle_untar_last_10mins:red = 5
|
|
|
indicator:count_classic_bundle_timeout_last_10mins:friendly_description = All classic bundles are replicated timely.
|
|
|
indicator:count_classic_bundle_timeout_last_10mins:description = This indicator counts the number of times that the classic bundle replication lasts longer than 10 seconds in a 10 minute window. By default, Yellow occurs when the number of replications lasting longer than 10 seconds is 2, and red when the number is 5, within the last 10 minutes. Setting both thresholds to 0 will disable this indicator.
|
|
|
indicator:count_classic_bundle_timeout_last_10mins:yellow = 2
|
|
|
indicator:count_classic_bundle_timeout_last_10mins:red = 5
|
|
|
|
|
|
[feature:data_durability]
|
|
|
display_name = Data Durability
|
|
|
friendly_description = Notifies when an indexer cluster does not have enough copies of indexed data.
|
|
|
indicator:cluster_replication_factor:friendly_description = All indexed data has enough replicated copies.
|
|
|
indicator:cluster_replication_factor:description = This indicator reflects whether or not the configured replication factor is met for an indexer cluster. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:cluster_replication_factor:red = 1
|
|
|
indicator:cluster_search_factor:friendly_description = All indexed data has enough searchable copies.
|
|
|
indicator:cluster_search_factor:description = This indicator reflects whether or not the configured search factor is met for an indexer cluster. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:cluster_search_factor:red = 1
|
|
|
|
|
|
[feature:remote_storage_configuration]
|
|
|
display_name = Remote Storage Configuration
|
|
|
friendly_description = Notifies when there are remote indexes that have search factor less than replication factor.
|
|
|
indicator:s2_sf_rf:friendly_description = Search factor is equal to replication factor for all remote indexes.
|
|
|
indicator:s2_sf_rf:description = This indicator tracks whether the search factor is equal to the replication factor when remote indexes exist. Yellow occurs when there are remote indexes that have search factor less than replication factor. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:s2_sf_rf:yellow = 1
|
|
|
|
|
|
[feature:data_searchable]
|
|
|
display_name = Data Searchable
|
|
|
friendly_description = Notifies when some data in the cluster is not searchable.
|
|
|
indicator:data_searchable:friendly_description = All indexed data (buckets) have a primary, searchable copy.
|
|
|
indicator:data_searchable:description = This indicator reflects whether ALL indexed data in a cluster is available to be searched. Red occurs when one or more buckets of data lack a primary (searchable) copy. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:data_searchable:red = 1
|
|
|
|
|
|
[feature:indexers]
|
|
|
display_name = Indexers
|
|
|
friendly_description = Shows the state of the indexer cluster by warning when any peer isn’t in a healthy state.
|
|
|
indicator:detention:friendly_description = No indexer cluster peers are in detention mode.
|
|
|
indicator:detention:description = This indicator tracks whether any indexer cluster peers are in detention mode. Yellow occurs when not less than 'indicator:detention:yellow' number of peers are in manual detention, Red when not less than 'indicator:detention:red' number of peers are in automatic detention. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:detention:yellow = 1
|
|
|
indicator:detention:red = 1
|
|
|
indicator:missing_peers:friendly_description = No indexer cluster peers are down.
|
|
|
indicator:missing_peers:description = This indicator tracks whether any indexer cluster peers are in transition. Yellow occurs when not less than 'indicator:missing_peers:yellow' number of peers are in status like: stopping, stopped, decommissioning, pending or restarting, Red when not less than 'indicator:missing_peers:red' number of peers are down. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:missing_peers:yellow = 1
|
|
|
indicator:missing_peers:red = 1
|
|
|
indicator:cm_service_interval_invalid:friendly_description = The Cluster Manager's service interval is set to an acceptable value within the configured threshold.
|
|
|
indicator:cm_service_interval_invalid:description = This indicator checks whether the Cluster Manager's service_interval setting is configured to an unhealthy value that could cause disruption to the Indexer Cluster. By default, Yellow occurs when the interval is configured to more than 60 seconds. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:cm_service_interval_invalid:yellow = 60
|
|
|
|
|
|
[feature:indexing_ready]
|
|
|
display_name = Indexing Ready
|
|
|
friendly_description = Notifies when the cluster does not contain the minimum number of peers required for a working cluster.
|
|
|
indicator:indexing_ready:friendly_description = Enough peers have joined for Indexer Clustering to be operational.
|
|
|
indicator:indexing_ready:description = This indicator becomes Green when indexer clustering becomes functional. This happens when enough peers join the cluster. Once Green, this indicator stays Green until the cluster manager is restarted. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:indexing_ready:red = 1
|
|
|
|
|
|
# Search Head Clustering
|
|
|
[feature:peer_connectivity]
|
|
|
display_name = Peer Connectivity
|
|
|
friendly_description = Notifies if this cluster peer can successfully connect to the cluster manager node.
|
|
|
indicator:peer_connectivity:friendly_description = Cluster peer is connected to the Cluster Manager.
|
|
|
indicator:peer_connectivity:description = This indicator reflects whether cluster peers can successfully connect to the cluster manager. If you are logged into a cluster peer, the indicator only reflects the status of the logged-in peer. Any failure results in Red. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:peer_connectivity:red = 1
|
|
|
|
|
|
[feature:replication_failures]
|
|
|
display_name = Replication Failures
|
|
|
friendly_description = Shows if this cluster peer experiences multiple consecutive bucket replication failures.
|
|
|
indicator:replication_failures:friendly_description = Replication of indexed data (buckets) is occurring successfully.
|
|
|
indicator:replication_failures:description = This indicator tracks whether the cluster peer is encountering repeated bucket replication failures. Yellow occurs after 5 consecutive failures, Red after 10.
|
|
|
indicator:replication_failures:red = 10
|
|
|
indicator:replication_failures:yellow = 5
|
|
|
|
|
|
[feature:searchheadconnectivity]
|
|
|
display_name = Search Head Connectivity
|
|
|
friendly_description = Notifies if search heads can connect to the indexer cluster manager node, and are running a version compatible with the manager node. If you are logged into a search head, the indicator only reflects the status of the logged-in member.
|
|
|
indicator:master_connectivity:friendly_description = Search head is connected to Cluster Manager.
|
|
|
indicator:master_connectivity:description = This indicator reflects whether or not this search head can successfully connect to the manager node. When Red, searches might be inaccurate due to outdated cluster information. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:master_connectivity:red = 1
|
|
|
indicator:master_version_compatibility:friendly_description = Cluster Manager and Search head versions are compatible.
|
|
|
indicator:master_version_compatibility:description = This indicator checks version compatibility between the manager node and search head. Yellow occurs when the manager node version is older than the search head version. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:master_version_compatibility:yellow = 1
|
|
|
indicator:searchhead_peer_connectivity:friendly_description = All search peers have connected to the search head.
|
|
|
indicator:searchhead_peer_connectivity:description = This indicator reflects whether or not there are search peers losing connection to search head. Yellow occurs when there are search peers that have lost connection to the search head. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:searchhead_peer_connectivity:yellow = 1
|
|
|
|
|
|
[feature:shc_members_overview]
|
|
|
display_name = SHC Cluster Members
|
|
|
friendly_description = Shows the state of the search head cluster by warning when a number of cluster members aren’t in a healthy state.
|
|
|
indicator:status:friendly_description = All members of search head cluster are present.
|
|
|
indicator:status:description = This indicator tracks whether the required number of search head cluster members are up and running. Green occurs when all members are up, Yellow when 'indicator:status:yellow' members are down for less than (2* heartbeat_timeout) amount of time, and Red if 'indicator:status:red' members are down for more than (2*heartbeat_timeout) amount of time.
|
|
|
indicator:status:yellow = 1
|
|
|
indicator:status:red = 1
|
|
|
indicator:replication_factor:friendly_description = Sufficient number of search head cluster members exist to meet configured search artifact replication factor.
|
|
|
indicator:replication_factor:description = This indicator tracks whether enough search head cluster members exist to honor the configured search artifact replication factor. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:replication_factor:yellow = 1
|
|
|
indicator:detention:friendly_description = No search head cluster members are in detention.
|
|
|
indicator:detention:description = This indicator tracks whether any search head cluster members are in detention mode. Yellow occurs when not less than 'indicator:detention:yellow' number of members are in manual detention, Red when not less than 'indicator:detention:red' number of members are in automatic detention. Green occurs when no members are in manual/automatic detention.
|
|
|
indicator:detention:yellow = 1
|
|
|
indicator:detention:red = 1
|
|
|
|
|
|
[feature:shc_captain_election_overview]
|
|
|
display_name = SHC Captain Election
|
|
|
friendly_description = Notifies if the search head cluster contains enough nodes to elect a dynamic cluster captain.
|
|
|
indicator:dynamic_captain_quorum:friendly_description = Sufficient number of search heads are connected to perform captain elections.
|
|
|
indicator:dynamic_captain_quorum:description = This indicator tracks whether quorum majority required to re-elect a dynamic captain has been lost. Yellow occurs when half or more members are down, Green otherwise. This feature can be disabled when a static captain is being used instead of a dynamic captain. However, we recommend keeping the feature enabled if static captain is being used only for disaster recovery.
|
|
|
indicator:dynamic_captain_quorum:yellow = 1
|
|
|
|
|
|
[feature:shc_captain_connection]
|
|
|
display_name = Captain Connection
|
|
|
friendly_description = Notifies if this search head cluster member is connected to the Search head cluster captain, and if that captain is valid.
|
|
|
indicator:captain_connection:friendly_description = Search head cluster member is connected to the captain.
|
|
|
indicator:captain_connection:description = This indicator checks whether a search head cluster member is able to communicate with the captain or not. Red occurs when a member cannot communicate with the captain, green otherwise.
|
|
|
indicator:captain_connection:red = 1
|
|
|
indicator:captain_existence:friendly_description = A valid captain exists in the search head cluster.
|
|
|
indicator:captain_existence:description = This indicator checks for the existence of a valid captain in the search head cluster. Red occurs when there is no valid captain in the SHC, green otherwise.
|
|
|
indicator:captain_existence:red = 1
|
|
|
indicator:captain_bundle_replication:description = This indicator checks whether there are bundle replication failures which could potentially cause slowness in the Search Head. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:captain_bundle_replication:friendly_description = The search head cluster member can successfully get the latest bundle from captain.
|
|
|
indicator:captain_bundle_replication:yellow = 1
|
|
|
|
|
|
[feature:shc_captain_common_baseline]
|
|
|
display_name = Common Baseline
|
|
|
friendly_description = Notifies if the Search head cluster captain's baseline configurations are in sync across all search head cluster members.
|
|
|
indicator:common_baseline:friendly_description = Common baseline is established between captain and all search head cluster members.
|
|
|
indicator:common_baseline:description = This indicator checks whether the captain shares a common baseline with all the search head cluster members or not. This indicator is red if a shared baseline is missing between the captain and any of the members, green otherwise.
|
|
|
indicator:common_baseline:red = 1
|
|
|
|
|
|
[feature:shc_snapshot_creation]
|
|
|
display_name = Snapshot Creation
|
|
|
friendly_description = Shows if the search head cluster members are successfully creating their bundle snapshots.
|
|
|
indicator:snapshot_creation:friendly_description = Snapshots are occurring within the configured replication summary period.
|
|
|
indicator:snapshot_creation:description = This indicator checks whether snapshots were created on each search head cluster member within a reasonable time. This indicator is green if snapshot creation happens in less than (indicator:snapshot_creation:yellow x conf_replication_summary.period) minutes, yellow if snapshot creation takes between (indicator:snapshot_creation:yellow * conf_replication_summary.period) and (indicator:snapshot_creation:red * conf_replication_summary.period) minutes, and red if it takes more than (indicator:snapshot_creation:red * conf_replication_summary.period) minutes.
|
|
|
indicator:snapshot_creation:yellow = 10
|
|
|
indicator:snapshot_creation:red = 20
|
|
|
|
|
|
[feature:peer_state]
|
|
|
display_name = Peer State
|
|
|
friendly_description = Shows when an index cluster peer node isn’t in a healthy state.
|
|
|
indicator:peer_state:friendly_description = No cluster peers are in detention.
|
|
|
indicator:peer_state:description = This indicator gauges whether the cluster peer is in an abnormal state. For example, manual detention will result in Yellow, and automatic detention will result in Red. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:peer_state:red = 1
|
|
|
indicator:peer_state:yellow = 1
|
|
|
|
|
|
[feature:peer_version]
|
|
|
display_name = Peer Version
|
|
|
friendly_description = Notifies when an indexer cluster peer node is running a version incompatible with the manager node.
|
|
|
indicator:peer_version:friendly_description = Cluster manager version is equal or greater than cluster peer version.
|
|
|
indicator:peer_version:description = This indicator checks version compatibility between the cluster manager and cluster peer. Red occurs when the cluster manager version is older than the cluster peer version.
|
|
|
indicator:peer_version:red = 1
|
|
|
|
|
|
# Index Processor
|
|
|
[feature:splunkoptimize_processes]
|
|
|
display_name = Bucket Optimization
|
|
|
friendly_description = Notifies when this indexer is falling behind with index bucket optimization tasks.
|
|
|
indicator:concurrent_optimize_processes_percent:friendly_description = Number of running optimization jobs is below the allowed maximum.
|
|
|
indicator:concurrent_optimize_processes_percent:description = This indicator tracks whether index optimization is falling behind. By default, this indicator becomes Yellow when 100% of the maximum allowed "splunk-optimize" processes are running.
|
|
|
indicator:concurrent_optimize_processes_percent:yellow = 100
|
|
|
|
|
|
[feature:buckets]
|
|
|
display_name = Buckets
|
|
|
friendly_description = Shows when index buckets are created at a size or rate that can impact search performance.
|
|
|
indicator:buckets_created_last_60m:friendly_description = In the last 60 minutes, the number of index buckets created is under the configured threshold.
|
|
|
indicator:buckets_created_last_60m:description = This indicator gauges whether incoming data is being appropriately bucketed within the Splunk index. By default, Yellow occurs when any index has created more than 40 buckets within the last hour, and Red when any index has created more than 60 buckets within the last hour. A high rate of bucket creation can cause severe search performance degradation, and might indicate poorly configured data processing (for example, timestamping).
|
|
|
indicator:buckets_created_last_60m:red = 60
|
|
|
indicator:buckets_created_last_60m:yellow = 40
|
|
|
indicator:percent_small_buckets_created_last_24h:friendly_description = Most or all buckets created over the last 24 hours are over the configured threshold.
|
|
|
indicator:percent_small_buckets_created_last_24h:description = This indicator tracks the percentage of small buckets created over the last 24 hours. A small bucket is defined as less than 10 % of the ‘maxDataSize’ setting in indexes.conf.
|
|
|
indicator:percent_small_buckets_created_last_24h:red = 30
|
|
|
indicator:percent_small_buckets_created_last_24h:yellow = 15
|
|
|
indicator:gigantic_bucket_size:friendly_description = All of the created buckets are within the configured size threshold.
|
|
|
indicator:gigantic_bucket_size:description = This indicator tracks the buckets with sizes that exceed the configured threshold in megabytes. A large bucket can cause high memory usage and performance degradation. By default, Yellow occurs when there are one or more buckets exceeding 20,000 MB, and Red when there are one or more buckets exceeding 50,000 MB. Setting both thresholds to 0 will disable this indicator.
|
|
|
indicator:gigantic_bucket_size:red = 50000
|
|
|
indicator:gigantic_bucket_size:yellow = 20000
|
|
|
indicator:count_bucket_rename_failure_last_10mins:friendly_description = All bucket renames during hot bucket rolling have been successful.
|
|
|
indicator:count_bucket_rename_failure_last_10mins:description = This indicator tracks whether the number of failures of bucket renames during hot bucket rolling is greater than the threshold. By default, Yellow occurs when there are 2 failures, and Red when there are 5 failures. Setting both thresholds to 0 will disable this indicator.
|
|
|
indicator:count_bucket_rename_failure_last_10mins:red = 5
|
|
|
indicator:count_bucket_rename_failure_last_10mins:yellow = 2
|
|
|
|
|
|
[feature:disk_space]
|
|
|
display_name = Disk Space
|
|
|
friendly_description = Shows if the filesystems have too little free space to continue operating.
|
|
|
indicator:disk_space_remaining_multiple_minfreespace:friendly_description = All Splunk index filesystems have sufficient space based on minimum Free Space setting.
|
|
|
indicator:disk_space_remaining_multiple_minfreespace:description = This indicator tracks whether all Splunk index filesystems contain sufficient free space to continue indexing. This calculation is based upon the 'minFreeSpace' setting in server.conf. By default, Yellow occurs when a filesystem's free space falls below (2* 'minFreeSpace'), and Red occurs when it falls below 'minFreeSpace'. If the index being reported on is a remote s2-enabled index, by default, Yellow occurs when a filesystem's free space falls below (1 * 'minFreeSpace'), and Red occurs when a filesystem's free space drops to 0.
|
|
|
indicator:disk_space_remaining_multiple_minfreespace:red = 1
|
|
|
indicator:disk_space_remaining_multiple_minfreespace:yellow = 2
|
|
|
indicator:max_volume_size_invalid:friendly_description = The setting maxVolumeDataSizeMB is not used in any remote storage volumes.
|
|
|
indicator:max_volume_size_invalid:description = The indicator tracks volumes with remote storage that use the maxVolumeDataSizeMB setting. The use of maxVolumeDataSizeMB will be ignored in these remote storage volumes. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:max_volume_size_invalid:yellow = 1
|
|
|
|
|
|
[feature:smart_storage]
|
|
|
display_name = Smart Storage
|
|
|
indicator:smart_storage_localize_on_time:friendly_description = Number of S2 buckets localization timeout is below the allowed threshold.
|
|
|
indicator:smart_storage_localize_on_time:description = This indicator tracks whether the S2 buckets are localized in a timely manner. By default, Yellow occurs when the number of bucket localization timeouts is 10. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:smart_storage_localize_on_time:yellow = 10
|
|
|
|
|
|
# Dynamic Data Active Archive
|
|
|
[feature:ddaa_archived_buckets]
|
|
|
display_name = Dynamic Data Archived Buckets
|
|
|
friendly_description = Shows if there are multiple failures while archiving buckets into AWS Glacier.
|
|
|
indicator:archived_buckets_failed_last_24h:friendly_description = Number of failed attempts to archive buckets under threshold within 24 hours.
|
|
|
indicator:archived_buckets_failed_last_24h:description = This indicator tracks the number of buckets that were attempted to be archived into Glacier but failed. Green occurs when fewer than 40 buckets in the last 24 hours have failed, yellow occurs when 40 or more buckets have failed, and red occurs when 80 or more buckets have failed.
|
|
|
indicator:archived_buckets_failed_last_24h:yellow = 40
|
|
|
indicator:archived_buckets_failed_last_24h:red = 80
|
|
|
|
|
|
[feature:searches_skipped]
|
|
|
display_name = Searches Skipped in the last 24 hours
|
|
|
friendly_description = Shows the load on the search scheduler by warning when there are a number of scheduled search attempts that the search scheduler could not start, typically due to long-running or excessive number of searches. For example, a specific scheduled search that is skipped 10 times will contribute to this indicator 10 times.
|
|
|
indicator:percent_searches_skipped_high_priority_last_24h:friendly_description = Most or all high-priority scheduled search attempts during the last 24 hours were successfully started.
|
|
|
indicator:percent_searches_skipped_high_priority_last_24h:description = This indicator tracks the skip rate for high priority scheduled searches. These are scheduled searches where the priority field is set to "higher" or "highest". By default, this indicator is yellow if the skipped search ratio over the last 24 hours is 5%, and red if it is 10%.
|
|
|
indicator:percent_searches_skipped_high_priority_last_24h:yellow = 5
|
|
|
indicator:percent_searches_skipped_high_priority_last_24h:red = 10
|
|
|
indicator:percent_searches_skipped_non_high_priority_last_24h:friendly_description = Most or all default-priority scheduled searches during the last 24 hours were successfully started.
|
|
|
indicator:percent_searches_skipped_non_high_priority_last_24h:description = This indicator tracks the skip rate for scheduled searches whose priority field is set to "default". By default, this indicator is yellow if the skipped search ratio over the last 24 hours is 10%, and red if it is 20%.
|
|
|
indicator:percent_searches_skipped_non_high_priority_last_24h:yellow = 10
|
|
|
indicator:percent_searches_skipped_non_high_priority_last_24h:red = 20
|
|
|
tree_view:health_subset = enabled
|
|
|
|
|
|
[feature:searches_delayed]
|
|
|
display_name = Search Scheduler Searches Delayed
|
|
|
friendly_description = Shows the load on the search scheduler by warning when there are a number of scheduled search attempts that the search scheduler is delaying. For example, if the scheduler delays a specific scheduled search 10 times, those delays will contribute to this indicator 10 times.
|
|
|
indicator:percent_searches_delayed_high_priority_last_24h:friendly_description = Percent of high priority scheduled search attempts that are delayed vs. non-high priority scheduled search attempts delayed in the last 24 hours.
|
|
|
indicator:percent_searches_delayed_high_priority_last_24h:description = This indicator tracks the delayed search rate for high priority scheduled searches. These are scheduled searches where the priority field is set to "higher" or "highest". By default, this indicator is yellow if the delayed search ratio over the last 24 hours is 5%, and red if it is 10%.
|
|
|
indicator:percent_searches_delayed_high_priority_last_24h:yellow = 5
|
|
|
indicator:percent_searches_delayed_high_priority_last_24h:red = 10
|
|
|
indicator:percent_searches_delayed_non_high_priority_last_24h:friendly_description = Percent of default priority scheduled search attempts that are delayed vs. all delayed search attempts in the last 24 hours.
|
|
|
indicator:percent_searches_delayed_non_high_priority_last_24h:description = This indicator tracks the delayed search rate for scheduled searches whose priority field is set to "default". By default, this indicator is yellow if the delayed search ratio over the last 24 hours is 10%, and red if it is 20%.
|
|
|
indicator:percent_searches_delayed_non_high_priority_last_24h:yellow = 10
|
|
|
indicator:percent_searches_delayed_non_high_priority_last_24h:red = 20
|
|
|
tree_view:health_subset = enabled
|
|
|
|
|
|
# Feature: Search Scheduler Search Lag.
# Indicators defined in this stanza:
#   - percent_searches_lagged_high_priority_last_24h: percentage, yellow at 10;
#     no red threshold is set here.
#   - percent_searches_lagged_non_high_priority_last_24h: percentage, yellow
#     at 40; no red threshold is set here.
#   - count_extremely_lagged_searches_last_hour: a count rather than a
#     percentage; yellow = 0, red = 1.
[feature:search_lag]
|
|
|
display_name = Search Scheduler Search Lag
|
|
|
friendly_description = Shows when scheduled searches are unable to start on-time, due to long-running or excessive number of searches.
|
|
|
indicator:percent_searches_lagged_high_priority_last_24h:friendly_description = Most or all high-priority scheduled searches over the last 24 hours started on-time.
|
|
|
indicator:percent_searches_lagged_high_priority_last_24h:description = This indicator tracks the lag rate of high priority scheduled searches. Search lag is a delay that occurs before a search starts. High priority scheduled searches are scheduled searches whose priority is set to "higher" or "highest". By default, this indicator is yellow if the search lag rate exceeds 10%.
|
|
|
indicator:percent_searches_lagged_high_priority_last_24h:yellow = 10
|
|
|
indicator:percent_searches_lagged_non_high_priority_last_24h:friendly_description = Most or all default-priority scheduled searches over the last 24 hours started on-time.
|
|
|
indicator:percent_searches_lagged_non_high_priority_last_24h:description = This indicator tracks the lag rate for scheduled searches whose priority is set to "default". Search lag is a delay that occurs before a search starts. By default, this indicator is yellow if the search lag rate over the last 24 hours exceeds 40%.
|
|
|
indicator:percent_searches_lagged_non_high_priority_last_24h:yellow = 40
|
|
|
indicator:count_extremely_lagged_searches_last_hour:friendly_description = All scheduled searches during the last hour were able to start within their configured interval.
|
|
|
indicator:count_extremely_lagged_searches_last_hour:description = This indicator checks whether there are any extremely lagged searches in the last hour. These are scheduled searches which have been unable to run within their configured interval. By default, this indicator is never Yellow, and is Red if there is at least 1 extremely lagged scheduled search. This is a numeric threshold rather than a percentage. Setting both thresholds to 0 will disable this indicator.
|
|
|
# Yellow is left at 0 so the indicator is never yellow by default (per the
# description above); only the red threshold is active.
indicator:count_extremely_lagged_searches_last_hour:yellow = 0
|
|
|
indicator:count_extremely_lagged_searches_last_hour:red = 1
|
|
|
# NOTE(review): presumably includes this feature in the
# [tree_view:health_subset] view declared earlier in this file - confirm.
tree_view:health_subset = enabled
|
|
|
|
|
|
# Feature: Scheduler Suppression.
# One indicator, suppression_list_oversized, which counts rows in the
# suppression list file. Only a yellow threshold is set; no red threshold is
# defined in this stanza. Unlike the neighbouring feature stanzas, this one
# sets no feature-level friendly_description.
[feature:scheduler_suppression]
|
|
|
display_name = Scheduler Suppression
|
|
|
indicator:suppression_list_oversized:friendly_description = The size of the suppression list is within limit.
|
|
|
indicator:suppression_list_oversized:description = The indicator tracks the size of the suppression list. By default, Yellow occurs when there are more than 1,000,000 rows in the suppression list file. You can disable this indicator by setting the threshold to 0.
|
|
|
# Row count; 0 disables the indicator (per the description above).
indicator:suppression_list_oversized:yellow = 1000000
|
|
|
|
|
|
# Feature: System Check (workload management).
# One indicator, system_check, reporting whether the Linux OS is set up
# properly for workload management. Only a red threshold is set; there is no
# yellow threshold in this stanza.
[feature:wlm_system_check]
|
|
|
display_name = System Check
|
|
|
friendly_description = Notifies if the Linux OS used by this node supports workload management.
|
|
|
indicator:system_check:friendly_description = Linux OS is set up properly for workload management.
|
|
|
indicator:system_check:description = This indicator checks whether the underlying Linux operating system is set up properly for workload management.
|
|
|
# NOTE(review): the value looks like a status code rather than a measured
# quantity (1 = check failed?) - confirm against the health.conf spec.
indicator:system_check:red = 1
|
|
|
|
|
|
# Feature: Configuration Check (workload management).
# One indicator, configuration_check, covering the validity of the workload
# management configuration, including pools and rules. Yellow at 1, red at 2.
[feature:wlm_configuration_check]
|
|
|
display_name = Configuration Check
|
|
|
friendly_description = Notifies if the workload management configuration on this node is valid.
|
|
|
indicator:configuration_check:friendly_description = Workload management configurations are valid.
|
|
|
indicator:configuration_check:description = This indicator checks whether the workload management configuration, including pools and rules, is valid.
|
|
|
# NOTE(review): 1 and 2 look like severity levels reported by the check rather
# than measured quantities - confirm against the health.conf spec.
indicator:configuration_check:yellow = 1
|
|
|
indicator:configuration_check:red = 2
|
|
|
|
|
|
# Feature: Admission Rules Check (workload management).
# One indicator, admission_rules_check, covering the validity of the workload
# management admission rules configuration. Yellow at 1, red at 2, the same
# threshold pair used by [feature:wlm_configuration_check].
[feature:admission_rules_check]
|
|
|
display_name = Admission Rules Check
|
|
|
friendly_description = Notifies if the workload management admission rules configuration on this node is valid.
|
|
|
indicator:admission_rules_check:friendly_description = Workload management admission rule configurations are valid.
|
|
|
indicator:admission_rules_check:description = This indicator checks whether the workload management admission rules configurations are valid.
|
|
|
# NOTE(review): 1 and 2 look like severity levels reported by the check rather
# than measured quantities - confirm against the health.conf spec.
indicator:admission_rules_check:yellow = 1
|
|
|
indicator:admission_rules_check:red = 2
|
|
|
|
|
|
# Feature: Ingestion Latency.
# Two indicators, both based on locally generated (synthetic) log events:
#   - ingestion_latency_lag_sec: latency in seconds; yellow at 15, red at 180.
#   - ingestion_latency_gap_multiplier: a multiplier, not a number of seconds.
#     The effective gap threshold is
#       multiplier * (matching ingestion_latency_lag_sec threshold) + 30
#     so the defaults give 1 * 15 + 30 = 45 s (yellow) and
#     1 * 180 + 30 = 210 s (red).
[feature:ingestion_latency]
|
|
|
display_name = Ingestion Latency
|
|
|
friendly_description = Shows the state of the indexer by warning when there's a delay in processing log events.
|
|
|
indicator:ingestion_latency_lag_sec:friendly_description = Data is being ingested in real-time. This is verified by comparing time of log event creation and time of indexing completion for synthetic log events generated on this indexer.
|
|
|
indicator:ingestion_latency_lag_sec:description = This indicator tracks the difference between the logging time and processing time of locally generated events to determine ingestion latency. By default, this indicator will turn Yellow at 15 seconds of latency, and Red if the latency reaches 180 seconds. Setting both values to 0 will disable this indicator.
|
|
|
# Seconds of latency.
indicator:ingestion_latency_lag_sec:yellow = 15
|
|
|
indicator:ingestion_latency_lag_sec:red = 180
|
|
|
indicator:ingestion_latency_gap_multiplier:friendly_description = Data is being ingested in real-time. This is verified by ensuring that synthetic log events generated on this indexer are ingested on a regular basis.
|
|
|
indicator:ingestion_latency_gap_multiplier:description = This indicator tracks locally generated events, and uses the time elapsed since the last event was ingested to determine the ingestion latency gap. By default, this indicator will turn Yellow when the latency gap reaches 45 seconds, and Red at 210 seconds. To calculate the warning value, multiply these values by their corresponding ingestion_latency_lag_sec and then add 30. Setting both values to 0 will disable this indicator.
|
|
|
# Multipliers applied to the ingestion_latency_lag_sec thresholds above.
indicator:ingestion_latency_gap_multiplier:yellow = 1
|
|
|
indicator:ingestion_latency_gap_multiplier:red = 1
|
|
|
|
|
|
# Feature: Forwarder Ingestion Latency.
# One indicator, ingestion_latency_indexer_health, aggregating the ingestion
# latency colors reported by forwarders. Both thresholds are percentages of
# the total number of forwarders:
#   yellow: 100 * (forwarders reporting yellow + red) / total exceeds the value
#   red:    100 * (forwarders reporting red) / total exceeds the value
[feature:ingestion_latency_reported]
|
|
|
display_name = Forwarder Ingestion Latency
|
|
|
friendly_description = Sufficient number of forwarders are ingesting data in real time.
|
|
|
indicator:ingestion_latency_indexer_health:friendly_description = Tracks ratio of forwarders that are experiencing ingestion latency vs. total forwarders to determine aggregate health.
|
|
|
indicator:ingestion_latency_indexer_health:description = This indicator tracks aggregated health of ingestion latencies as reported by forwarders. Thresholds are in percents, once 100*(number of forwarders reporting yellow + red colors)/(total number of forwarders) exceeds yellow threshold the indexer will report aggregated color as yellow color. Once 100*(number of forwarders reporting red colors)/(total number of forwarders) exceeds red threshold the indexer will report aggregated color as red color.
|
|
|
# Percent of forwarders (see the description above).
indicator:ingestion_latency_indexer_health:yellow = 1
|
|
|
indicator:ingestion_latency_indexer_health:red = 1
|
|
|
|
|
|
# HEC Health
|
|
|
# Feature: HEC (HTTP Event Collector) health status.
# One indicator, hec_health, expressed as a percentage of maximum resource
# usage. A threshold trips when
#   (current resource usage) > (maximum resource usage * threshold / 100)
# Yellow at 50 percent, red at 80 percent.
[feature:hec_health_status]
|
|
|
display_name = HEC health status
|
|
|
friendly_description = Tracks HEC healthiness status
|
|
|
indicator:hec_health:friendly_description = This indicator tracks the current capacity of HEC (Http Event Collector) related resources, mainly queue length, for example the http input queue and ack queue.
|
|
|
indicator:hec_health:description = This indicator tracks the percentage of resources that HEC uses. The Splunk platform calculates the indicator as (current resource usage) > (maximum resource usage * threshold / 100). The indicator reports as yellow when it exceeds 50 percent and red when it exceeds 80 percent.
|
|
|
# Percent of maximum resource usage.
indicator:hec_health:yellow = 50
|
|
|
indicator:hec_health:red = 80
|
|
|
|
|
|
# Feature: IOWait.
# Three indicators, each evaluated over the last 3 minute window:
#   - avg_cpu__max_perc_last_3m: average IOWait percentage across all CPUs
#     (yellow 1, red 3).
#   - single_cpu__max_perc_last_3m: IOWait percentage of the single most
#     bottle-necked CPU (yellow 5, red 10).
#   - sum_top3_cpu_percs__max_last_3m: sum of IOWait percentages of the three
#     most bottle-necked CPUs (yellow 7, red 15).
# Thresholds are listed yellow first, then red, matching the other feature
# stanzas in this file.
[feature:iowait]
|
|
|
display_name = IOWait
|
|
|
friendly_description = Shows if there's a delay in disk I/O requests that can impact indexing and search performance on this node.
|
|
|
indicator:avg_cpu__max_perc_last_3m:friendly_description = Average time spent waiting for I/O, across all CPUs, is under threshold during the last 3 minutes.
|
|
|
indicator:avg_cpu__max_perc_last_3m:description = This indicator tracks the average IOWait percentage across all CPUs on the machine running the Splunk Enterprise instance, over the last 3 minute window. By default, this indicator will turn Yellow if the percentage exceeds 1% and Red if it exceeds 3% during this window.
|
|
|
indicator:avg_cpu__max_perc_last_3m:yellow = 1
|
|
|
indicator:avg_cpu__max_perc_last_3m:red = 3
|
|
|
indicator:single_cpu__max_perc_last_3m:friendly_description = No individual CPU spent an excessive amount of time waiting for I/O during the last 3 minutes.
|
|
|
indicator:single_cpu__max_perc_last_3m:description = This indicator tracks the IOWait percentage for the single most bottle-necked CPU on the machine running the Splunk Enterprise instance, over the last 3 minute window. By default, this indicator will turn Yellow if the percentage exceeds 5% and Red if it exceeds 10% during this window.
|
|
|
indicator:single_cpu__max_perc_last_3m:yellow = 5
|
|
|
indicator:single_cpu__max_perc_last_3m:red = 10
|
|
|
indicator:sum_top3_cpu_percs__max_last_3m:friendly_description = IOWait of the 3 busiest CPUs has not exceeded the configured threshold during the last 3 minutes.
|
|
|
indicator:sum_top3_cpu_percs__max_last_3m:description = This indicator tracks the sum of IOWait percentage for the three most bottle-necked CPUs on the machine running the Splunk Enterprise instance, over the last 3 minute window. By default, this indicator will turn Yellow if the sum exceeds 7 and Red if it exceeds 15 during this window.
|
|
|
indicator:sum_top3_cpu_percs__max_last_3m:yellow = 7
|
|
|
indicator:sum_top3_cpu_percs__max_last_3m:red = 15
|
|
|
|
|
|
# Feature: Ingest Actions Output.
# Two indicators for the rfsoutput component:
#   - output_rate: consecutive failures to insert data into Splunk's
#     processing queue (yellow 5, red 10).
#   - write_failure: consecutive failures to upload data to an external
#     destination (yellow 2, red 10).
# For both indicators, setting both thresholds to 0 disables the indicator.
[feature:ingest_actions_output]
|
|
|
display_name = Ingest Actions Output
|
|
|
friendly_description = Indicates whether Ingest Actions is able to successfully route data to destinations.
|
|
|
indicator:output_rate:friendly_description = Indicates that Ingest Actions is able to insert data into Splunk's processing queue.
|
|
|
indicator:output_rate:description = This indicator reflects the number of consecutive times the rfsoutput processor was unable to insert data into Splunk's processing queue. By default, this indicator becomes yellow after consecutive insertion failures for 5 seconds, and red after 10 seconds. Setting both thresholds to 0 will disable this indicator.
|
|
|
indicator:output_rate:yellow = 5
|
|
|
indicator:output_rate:red = 10
|
|
|
indicator:write_failure:friendly_description = Indicates that Ingest Actions is able to upload data to destinations successfully.
|
|
|
indicator:write_failure:description = The indicator reflects the number of consecutive times the rfsoutput worker was unable to upload data to an external destination. By default, this indicator becomes yellow after 2 consecutive write failures, red after 10 consecutive failures. Setting both thresholds to 0 will disable this indicator.
|
|
|
# Counts of consecutive write failures.
indicator:write_failure:yellow = 2
|
|
|
indicator:write_failure:red = 10
|
|
|
# NOTE(review): presumably includes this feature in the
# [tree_view:health_subset] view declared earlier in this file - confirm.
tree_view:health_subset = enabled
|
|
|
|