From 93fdd1c012ca64470829fbf68bb5bb6193907c35 Mon Sep 17 00:00:00 2001 From: Audrius Butkevicius Date: Fri, 27 Jul 2018 06:59:55 +0100 Subject: [PATCH] cmd/strelaypoolsrv: Prevent scraped metrics moving backwards (#5068) --- cmd/strelaypoolsrv/main.go | 2 +- cmd/strelaypoolsrv/stats.go | 41 ++++++++++++++++++++++++++++++-- cmd/strelaypoolsrv/stats_test.go | 21 ++++++++++++++++ 3 files changed, 61 insertions(+), 3 deletions(-) create mode 100644 cmd/strelaypoolsrv/stats_test.go diff --git a/cmd/strelaypoolsrv/main.go b/cmd/strelaypoolsrv/main.go index 9784c0217..7f72a88d6 100644 --- a/cmd/strelaypoolsrv/main.go +++ b/cmd/strelaypoolsrv/main.go @@ -496,7 +496,7 @@ func handleRelayTest(request request) { mut.Lock() if stats != nil { - updateMetrics(request.relay.uri.Host, stats, location) + updateMetrics(request.relay.uri.Host, *stats, location) } request.relay.Stats = stats request.relay.StatsRetrieved = time.Now() diff --git a/cmd/strelaypoolsrv/stats.go b/cmd/strelaypoolsrv/stats.go index 713141f5d..5fa8c047e 100644 --- a/cmd/strelaypoolsrv/stats.go +++ b/cmd/strelaypoolsrv/stats.go @@ -44,6 +44,8 @@ var ( relayGlobalRate = makeGauge("relay_global_rate", "Global rate applied on the whole relay", "relay") relayBuildInfo = makeGauge("relay_build_info", "Build information about a relay", "relay", "go_version", "go_os", "go_arch") relayLocationInfo = makeGauge("relay_location_info", "Location information about a relay", "relay", "city", "country", "continent") + + lastStats = make(map[string]stats) ) func makeGauge(name string, help string, labels ...string) *prometheus.GaugeVec { @@ -142,7 +144,7 @@ func refreshStats() { if result.stats == nil { deleteMetrics(result.relay.uri.Host) } else { - updateMetrics(result.relay.uri.Host, result.stats, result.relay.Location) + updateMetrics(result.relay.uri.Host, *result.stats, result.relay.Location) } } mut.Unlock() @@ -182,13 +184,18 @@ func fetchStats(relay *relay) *stats { return &stats } -func updateMetrics(host string, stats *stats, location location) { +func updateMetrics(host string, stats stats, location location) { if stats.GoVersion != "" || stats.GoOS != "" || stats.GoArch != "" { relayBuildInfo.WithLabelValues(host, stats.GoVersion, stats.GoOS, stats.GoArch).Add(1) } if location.City != "" || location.Country != "" || location.Continent != "" { relayLocationInfo.WithLabelValues(host, location.City, location.Country, location.Continent).Add(1) } + + if lastStat, ok := lastStats[host]; ok { + stats = mergeStats(stats, lastStat) + } + relayUptime.WithLabelValues(host).Set(float64(stats.UptimeSeconds)) relayPendingSessionKeys.WithLabelValues(host).Set(float64(stats.PendingSessionKeys)) relayActiveSessions.WithLabelValues(host).Set(float64(stats.ActiveSessions)) @@ -198,6 +205,7 @@ func updateMetrics(host string, stats *stats, location location) { relayGoRoutines.WithLabelValues(host).Set(float64(stats.GoRoutines)) relaySessionRate.WithLabelValues(host).Set(float64(stats.Options.SessionRate)) relayGlobalRate.WithLabelValues(host).Set(float64(stats.Options.GlobalRate)) + lastStats[host] = stats } func deleteMetrics(host string) { @@ -210,4 +218,33 @@ func deleteMetrics(host string) { relayGoRoutines.DeleteLabelValues(host) relaySessionRate.DeleteLabelValues(host) relayGlobalRate.DeleteLabelValues(host) + delete(lastStats, host) +} + +// Due to some unexplainable behaviour, some of the numbers sometimes travel slightly backwards (by less than 1%) +// This happens between scrapes, which is 30s, so this can't be a race. +// This causes prometheus to assume a "rate reset", hence causes phenomenal spikes. +// One of the number that moves backwards is BytesProxied, which atomically increments a counter with numeric value +// returned by net.Conn.Read(). I don't think that can return a negative value, so I have no idea what's going on. +func mergeStats(new stats, old stats) stats { + new.UptimeSeconds = mergeValue(new.UptimeSeconds, old.UptimeSeconds) + new.PendingSessionKeys = mergeValue(new.PendingSessionKeys, old.PendingSessionKeys) + new.ActiveSessions = mergeValue(new.ActiveSessions, old.ActiveSessions) + new.Connections = mergeValue(new.Connections, old.Connections) + new.Proxies = mergeValue(new.Proxies, old.Proxies) + new.BytesProxied = mergeValue(new.BytesProxied, old.BytesProxied) + new.GoRoutines = mergeValue(new.GoRoutines, old.GoRoutines) + new.Options.SessionRate = mergeValue(new.Options.SessionRate, old.Options.SessionRate) + new.Options.GlobalRate = mergeValue(new.Options.GlobalRate, old.Options.GlobalRate) + return new +} + +func mergeValue(new, old int) int { + if new >= old { + return new // normal increase + } + if float64(new) > 0.99*float64(old) { + return old // slight backward movement + } + return new // reset (relay restart) } diff --git a/cmd/strelaypoolsrv/stats_test.go b/cmd/strelaypoolsrv/stats_test.go new file mode 100644 index 000000000..414696d79 --- /dev/null +++ b/cmd/strelaypoolsrv/stats_test.go @@ -0,0 +1,21 @@ +// Copyright (C) 2015 Audrius Butkevicius and Contributors (see the CONTRIBUTORS file). + +package main + +import ( + "testing" +) + +func TestMerge(t *testing.T) { + if mergeValue(1001, 1000) != 1001 { + t.Error("the computer says no") + } + + if mergeValue(999, 1000) != 1000 { + t.Error("the computer says no") + } + + if mergeValue(1, 1000) != 1 { + t.Error("the computer says no") + } +}