stats: add query max wait time metric

Add new statistic value to know the size of the queue in microseconds. Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
2024-09-21 06:37:08 +00:00 · 2023-04-21 09:23:21 -07:00 · 2023-04-21 09:23:21 -07:00 · 263096d1f6
commit 263096d1f6
parent e577ab105e
5 changed files with 16 additions and 4 deletions
--- a/daemon/remote.c
+++ b/daemon/remote.c
@ -681,6 +681,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s)
 		(unsigned long)s->svr.num_queries_prefetch)) return 0;
 	if(!ssl_printf(ssl, "%s.num.queries_timed_out"SQ"%lu\n", nm,
 		(unsigned long)s->svr.num_queries_timed_out)) return 0;
+	if(!ssl_printf(ssl, "%s.query.queue_time_us.max"SQ"%lu\n", nm,
+		(unsigned long)s->svr.max_query_time_us)) return 0;
 	if(!ssl_printf(ssl, "%s.num.expired"SQ"%lu\n", nm,
 		(unsigned long)s->svr.ans_expired)) return 0;
 	if(!ssl_printf(ssl, "%s.num.recursivereplies"SQ"%lu\n", nm,
--- a/daemon/stats.c
+++ b/daemon/stats.c
@ -433,6 +433,8 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a)
 	total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache;
 	total->svr.num_queries_prefetch += a->svr.num_queries_prefetch;
 	total->svr.num_queries_timed_out += a->svr.num_queries_timed_out;
+	if (total->svr.max_query_time_us < a->svr.max_query_time_us)
+		total->svr.max_query_time_us = a->svr.max_query_time_us;
 	total->svr.sum_query_list_size += a->svr.sum_query_list_size;
 	total->svr.ans_expired += a->svr.ans_expired;
 #ifdef USE_DNSCRYPT
--- a/daemon/worker.c
+++ b/daemon/worker.c
@ -1281,6 +1281,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
 	int is_expired_answer = 0;
 	int is_secure_answer = 0;
 	int rpz_passthru = 0;
+	long long wait_queue_time = 0;
 	/* We might have to chase a CNAME chain internally, in which case
 	 * we'll have up to two replies and combine them to build a complete
 	 * answer.  These variables control this case. */
@ -1289,6 +1290,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
 	struct query_info* lookup_qinfo = &qinfo;
 	struct query_info qinfo_tmp; /* placeholder for lookup_qinfo */
 	struct respip_client_info* cinfo = NULL, cinfo_tmp;
+	struct timeval wait_time;
 	memset(&qinfo, 0, sizeof(qinfo));

 	if((error != NETEVENT_NOERROR && error != NETEVENT_DONE)|| !repinfo) {
@ -1297,9 +1299,13 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
 		return 0;
 	}

-	if (worker->env.cfg->sock_queue_timeout && timeval_isset(c->recv_tv)) {
-	  c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout;
-		if (timeval_smaller(c->recv_tv, worker->env.now_tv)) {
+	if (worker->env.cfg->sock_queue_timeout && timeval_isset(&c->recv_tv)) {
+		timeval_subtract(&wait_time, worker->env.now_tv, &c->recv_tv);
+		wait_queue_time = wait_time.tv_sec * 1000000 +  wait_time.tv_usec;
+		if (worker->stats.max_query_time_us < wait_queue_time)
+			worker->stats.max_query_time_us = wait_queue_time;
+		c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout;
+		if (timeval_smaller(&c->recv_tv, worker->env.now_tv)) {
 		/* count and drop queries that were sitting in the socket queue too long */
 			worker->stats.num_queries_timed_out++;
 			return 0;
--- a/libunbound/unbound.h
+++ b/libunbound/unbound.h
@ -701,7 +701,8 @@ struct ub_server_stats {
 	long long num_queries_prefetch;
 	/** number of queries which are too late to process */
 	long long num_queries_timed_out;
-
+	/** the longest wait time in the queue */
+	long long max_query_time_us;
 	/**
 	 * Sum of the querylistsize of the worker for
 	 * every query that missed cache. To calculate average.
--- a/smallapp/unbound-control.c
+++ b/smallapp/unbound-control.c
@ -209,6 +209,7 @@ static void pr_stats(const char* nm, struct ub_stats_info* s)
 	PR_UL_NM("num.cachemiss", s->svr.num_queries_missed_cache);
 	PR_UL_NM("num.prefetch", s->svr.num_queries_prefetch);
 	PR_UL_NM("num.queries_timed_out", s->svr.num_queries_timed_out);
+	PR_UL_NM("query.queue_time_us.max", s->svr.max_query_time_us);
 	PR_UL_NM("num.expired", s->svr.ans_expired);
 	PR_UL_NM("num.recursivereplies", s->mesh_replies_sent);
 #ifdef USE_DNSCRYPT