Puppet Plan: pe_status_check::agent_state_summary

Defined in:
plans/agent_state_summary.pp

Summary

provides an overview of all Puppet agents and their error states

Overview

Parameters:

  • runinterval (Integer[0]) (defaults to: 30)

    The run interval of the Puppet agent, in minutes. A node whose latest report is at least this old is considered unresponsive.

  • log_healthy_nodes (Boolean) (defaults to: false)

    When true, also return the list of healthy nodes, not only the unhealthy ones.

  • log_unhealthy_nodes (Boolean) (defaults to: true)

    When false, omit the per-category lists of unhealthy nodes and return only the counters.

Author:

  • Tim Meusel <tim@bastelfreak.de>



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'plans/agent_state_summary.pp', line 10

# Queries PuppetDB for every node and groups the certnames by the state of
# their latest report (noop, corrective changes, cached catalog, failed,
# changed, unresponsive).
#
# $runinterval is given in minutes; a node whose latest report is at least
# that old, or that has no report timestamp at all, is listed as unresponsive.
# $log_healthy_nodes adds a 'healthy' list to the result.
# $log_unhealthy_nodes = false reduces the result to the three counters.
#
# Returns a Hash with the per-state certname lists and/or the counters.
plan pe_status_check::agent_state_summary (
  Integer[0] $runinterval = 30,
  Boolean $log_healthy_nodes = false,
  Boolean $log_unhealthy_nodes = true,
){
  # a list of all nodes and the state of their latest report
  $nodes = puppetdb_query('nodes[certname,latest_report_noop,latest_report_corrective_change,cached_catalog_status,latest_report_status,report_timestamp]{}')
  $fqdns = $nodes.map |$node| { $node['certname'] }

  # check if the last report is older than X minutes
  $current_timestamp = Integer(Timestamp().strftime('%s'))
  $runinterval_seconds = $runinterval * 60
  $unresponsive = $nodes.filter |$node| {
    $report_timestamp = $node['report_timestamp']
    if $report_timestamp =~ Undef {
      # No report timestamp available (e.g. the node never submitted a report).
      # Converting the missing value to a Timestamp would abort the whole plan,
      # so such a node is counted as unresponsive instead.
      true
    } else {
      $old_timestamp = Integer(Timestamp($report_timestamp).strftime('%s'))
      ($current_timestamp - $old_timestamp) >= $runinterval_seconds
    }
  }.map |$node| { $node['certname'] }

  # all nodes that delivered a report in time
  $responsive = $fqdns - $unresponsive

  # all nodes that used noop for the last catalog
  $noop = $nodes.filter |$node| {
    $node['latest_report_noop'] == true
  }.map |$node| { $node['certname'] }

  # all nodes that reported corrective changes
  $corrective_changes = $nodes.filter |$node| {
    $node['latest_report_corrective_change'] == true
  }.map |$node| { $node['certname'] }

  # all nodes that used a cached catalog on the last run
  # a missing status is not evidence that a cached catalog was used, so only
  # defined values other than 'not_used' are counted
  $used_cached_catalog = $nodes.filter |$node| {
    $node['cached_catalog_status'] =~ NotUndef and $node['cached_catalog_status'] != 'not_used'
  }.map |$node| { $node['certname'] }

  # all nodes with failed resources in the last report
  $failed = $nodes.filter |$node| {
    $node['latest_report_status'] == 'failed'
  }.map |$node| { $node['certname'] }

  # all nodes with changes in the last report
  $changed = $nodes.filter |$node| {
    $node['latest_report_status'] == 'changed'
  }.map |$node| { $node['certname'] }

  # all nodes that aren't healthy in any form
  $unhealthy = [$noop, $corrective_changes, $used_cached_catalog, $failed, $changed, $unresponsive].flatten.unique

  # all healthy nodes
  $healthy = $fqdns - $unhealthy

  # the counters are always part of the result
  $counters = {
    'unhealthy_counter' => $unhealthy.count,
    'healthy_counter'   => $healthy.count,
    'total_counter'     => $fqdns.count,
  }

  $data = if $log_unhealthy_nodes {
    {
      'noop'                => $noop,
      'corrective_changes'  => $corrective_changes,
      'used_cached_catalog' => $used_cached_catalog,
      'failed'              => $failed,
      'changed'             => $changed,
      'unresponsive'        => $unresponsive,
      'responsive'          => $responsive,
      'unhealthy'           => $unhealthy,
    } + $counters
  } else {
    $counters
  }

  return if $log_healthy_nodes {
    $data + { 'healthy' => $healthy }
  } else {
    $data
  }
}