{
  "plugin": "custom",
  "pluginConfig": {
    "invoke_interval": "1m",
    "timeout": "1m",
    "max_output_length": 80,
    "concurrency": 1,
    "skip_initial_status": true
  },
  "source": "journalctl-custom-monitor",
  "metricsReporting": true,
  "conditions": [
    {
      "type": "PLEGisUnhealthy",
      "reason": "PLEGisHealthy",
      "message": "PLEG is functioning properly"
    }
  ],
  "rules": [
    {
      "type": "permanent",
      "condition": "PLEGisUnhealthy",
      "reason": "PLEGisUnhealthy",
      // Here we use a custom alternative to the logcounter that ships with NPD,
      // because our version runs much faster on large logs.
      "path": "/home/kubernetes/bin/journalcounter",
      "args": [
        "--identifier=kubelet",
        "--lookback=10m",
        "--count=3",
        "--pattern=PLEG is not healthy: pleg was last seen active"
      ],
      "timeout": "1m"
    }
  ]
}
// Command tcp-connect reports whether at least one of the TCP endpoints given
// on the command line accepts a connection. It prints a one-line message and
// exits with status 1 when no endpoint is reachable or an argument is
// malformed.
package main

import (
	"fmt"
	"net"
	"os"
	"strings"
	"time"
)

// TIMEOUT bounds each individual connection attempt.
const TIMEOUT = 2 * time.Second

// checkTCPConnect tries to open a TCP connection to each endpoint in order and
// stops at the first one that succeeds.
//
// Each endpoint must be in "host:port" form; bracketed IPv6 literals such as
// "[::1]:443" are accepted. The boolean result is true when at least one
// endpoint accepted a connection. The string is the message to print: an
// invalid-format error, a failure listing every endpoint, or a success note.
func checkTCPConnect(endpoints []string) (bool, string) {
	// Validate everything up front so that a malformed argument is reported
	// immediately instead of after waiting on dial timeouts for the endpoints
	// that precede it.
	for _, endpoint := range endpoints {
		if _, _, err := net.SplitHostPort(endpoint); err != nil {
			return false, fmt.Sprintf("INVALID ENDPOINT FORMAT: %s", endpoint)
		}
	}

	endpointString := strings.Join(endpoints, ", ")

	for _, endpoint := range endpoints {
		conn, err := net.DialTimeout("tcp", endpoint, TIMEOUT)
		if err != nil {
			continue
		}
		// Only reachability matters, so release the socket right away rather
		// than deferring the close until the function returns. A close error
		// carries no information about reachability and is ignored.
		_ = conn.Close()
		return true, fmt.Sprintf("connected to at least one endpoint: %s", endpointString)
	}

	// We use uppercase writing to make errors more noticeable among node conditions
	return false, fmt.Sprintf("TIMEOUT TO ENDPOINTS: %s", strings.ToUpper(endpointString))
}

// main checks the endpoints passed as arguments, prints the resulting message
// to stdout and exits non-zero when the check fails.
func main() {
	if len(os.Args) < 2 {
		fmt.Println("Usage: tcp-connect address1:port1 address2:port2 ...")
		os.Exit(1)
	}

	result, msg := checkTCPConnect(os.Args[1:])
	fmt.Println(msg)
	if !result {
		os.Exit(1)
	}
}
package mainimport ("bufio""errors""fmt""net""os""reflect""regexp""strings""time"log "github.com/sirupsen/logrus")// Timeout for querying BIRD.var birdTimeOut = 4 * time.Second// Expected BIRD protocol table columnsvar birdExpectedHeadings = []string{"name", "proto", "table", "state", "since", "info"}// bgpPeer is a structure containing details about a BGP peertype bgpPeer struct {PeerIP stringPeerType stringState stringSince stringBGPState stringInfo string}// Check for Word_<IP> where every octate is separated by "_", regardless of IP protocols// Example match: "Mesh_192_168_56_101" or "Mesh_fd80_24e2_f998_72d7__2"var bgpPeerRegex = regexp.MustCompile(`^(Global|Node|Mesh)_(.+)$`)// Mapping the BIRD/GoBGP type extracted from the peer name to the display typevar bgpTypeMap = map[string]string{"Global": "global","Mesh": "node-to-node mesh","Node": "node specific",}func checkBGPPeers() (bool, string) {// Show debug messages// log.SetLevel(log.DebugLevel)// Try connecting to the bird socket in `/var/run/calico/` first to get the datac, err := net.Dial("unix", "/var/run/calico/bird.ctl")if err != nil {// If that fails, try connecting to bird socket in `/var/run/bird` (which is the// default socket location for bird install) for non-containerized installsc, err = net.Dial("unix", "/var/run/bird/bird.ctl")if err != nil {return false, "ERROR: UNABLE TO OPEN BIRD SOCKET"}}defer c.Close()// To query the current state of the BGP peers, we connect to the BIRD// socket and send a "show protocols" message. 
BIRD responds with// peer data in a table format//// Send the request_, err = c.Write([]byte("show protocols\n"))if err != nil {return false, "UNABLE TO WRITE TO BIRD SOCKET"}// Scan the output and collect parsed BGP peerspeers, err := scanBIRDPeers(c)if err != nil {// If "read unix @->/var/run/calico/bird.ctl: i/o timeout" then skip check// This error usually means that it is very high LA on nodeif netErr, ok := err.(net.Error); ok && netErr.Timeout() {return true, fmt.Sprintf("Skipping because of: %v", err)} else {return false, fmt.Sprintf("ERROR: %v", err)}}// If no peers were returned then just print a messageif len(peers) == 0 {return false, "CALICO HAS NO BGP PEERS"}for _, peer := range peers {log.Debugf(peer.PeerIP, peer.BGPState)if peer.BGPState == "Established" {return true, "calico bird have at least one peer with established connection"}}return false, "NO CONNECTION TO BGP PEERS"}func scanBIRDPeers(conn net.Conn) ([]bgpPeer, error) {ipSep := "."// The following is sample output from BIRD//// 0001 BIRD 1.5.0 ready.// 2002-name proto table state since info// 1002-kernel1 Kernel master up 2016-11-21// device1 Device master up 2016-11-21// direct1 Direct master up 2016-11-21// Mesh_172_17_8_102 BGP master up 2016-11-21 Established// 0000scanner := bufio.NewScanner(conn)peers := []bgpPeer{}// Set a time-out for reading from the socket connectionerr := conn.SetReadDeadline(time.Now().Add(birdTimeOut))if err != nil {return nil, errors.New("failed to set time-out")}for scanner.Scan() {// Process the next line that has been read by the scannerstr := scanner.Text()log.Debug(str)if strings.HasPrefix(str, "0000") {// "0000" means end of databreak} else if strings.HasPrefix(str, "0001") {// "0001" code means BIRD is ready} else if strings.HasPrefix(str, "2002") {// "2002" code means start of headingsf := strings.Fields(str[5:])if !reflect.DeepEqual(f, birdExpectedHeadings) {return nil, errors.New("unknown BIRD table output format")}} else if strings.HasPrefix(str, 
"1002") {// "1002" code means first row of datapeer := bgpPeer{}if peer.unmarshalBIRD(str[5:], ipSep) {peers = append(peers, peer)}} else if strings.HasPrefix(str, " ") {// Row starting with a " " is another row of datapeer := bgpPeer{}if peer.unmarshalBIRD(str[1:], ipSep) {peers = append(peers, peer)}} else {// Format of row is unexpectedreturn nil, errors.New("unexpected output line from BIRD")}// Before reading the next line, adjust the time-out for// reading from the socket connectionerr = conn.SetReadDeadline(time.Now().Add(birdTimeOut))if err != nil {return nil, errors.New("failed to adjust time-out")}}return peers, scanner.Err()}// Unmarshal a peer from a line in the BIRD protocol output. Returns true if// successful, false otherwisefunc (b *bgpPeer) unmarshalBIRD(line, ipSep string) bool {columns := strings.Fields(line)if len(columns) < 6 {log.Debug("Not a valid line: fewer than 6 columns")return false}if columns[1] != "BGP" {log.Debug("Not a valid line: protocol is not BGP")return false}// Check the name of the peer is of the correct format. This regex// returns two components:// - A type (Global|Node|Mesh) which we can map to a display type// - An IP address (with _ separating the octets)sm := bgpPeerRegex.FindStringSubmatch(columns[0])if len(sm) != 3 {log.Debugf("Not a valid line: peer name '%s' is not correct format", columns[0])return false}var ok boolb.PeerIP = strings.Replace(sm[2], "_", ipSep, -1)if b.PeerType, ok = bgpTypeMap[sm[1]]; !ok {log.Debugf("Not a valid line: peer type '%s' is not recognized", sm[1])return false}// Store remaining columns (piecing back together the info string)b.State = columns[3]b.Since = columns[4]b.BGPState = columns[5]if len(columns) > 6 {b.Info = strings.Join(columns[6:], " ")}return true}func main() {var message stringvar result boolresult, message = checkBGPPeers()fmt.Println(message)if !result {os.Exit(1)}}
При проектировании проверок я исходил из того, что они должны отвечать одному из следующих требований: либо однозначно детектировать известную нам проблему, либо давать ясное представление о каком-либо состоянии.