stats: add query max wait time metric

Add new statistic value to know the size of the queue in microseconds.

Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
This commit is contained in:
Vadim Fedorenko 2023-04-21 09:23:21 -07:00
parent e577ab105e
commit 263096d1f6
5 changed files with 16 additions and 4 deletions

View File

@ -681,6 +681,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s)
(unsigned long)s->svr.num_queries_prefetch)) return 0;
if(!ssl_printf(ssl, "%s.num.queries_timed_out"SQ"%lu\n", nm,
(unsigned long)s->svr.num_queries_timed_out)) return 0;
if(!ssl_printf(ssl, "%s.query.queue_time_us.max"SQ"%lu\n", nm,
(unsigned long)s->svr.max_query_time_us)) return 0;
if(!ssl_printf(ssl, "%s.num.expired"SQ"%lu\n", nm,
(unsigned long)s->svr.ans_expired)) return 0;
if(!ssl_printf(ssl, "%s.num.recursivereplies"SQ"%lu\n", nm,

View File

@ -433,6 +433,8 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a)
total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache;
total->svr.num_queries_prefetch += a->svr.num_queries_prefetch;
total->svr.num_queries_timed_out += a->svr.num_queries_timed_out;
if (total->svr.max_query_time_us < a->svr.max_query_time_us)
total->svr.max_query_time_us = a->svr.max_query_time_us;
total->svr.sum_query_list_size += a->svr.sum_query_list_size;
total->svr.ans_expired += a->svr.ans_expired;
#ifdef USE_DNSCRYPT

View File

@ -1281,6 +1281,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
int is_expired_answer = 0;
int is_secure_answer = 0;
int rpz_passthru = 0;
long long wait_queue_time = 0;
/* We might have to chase a CNAME chain internally, in which case
* we'll have up to two replies and combine them to build a complete
* answer. These variables control this case. */
@ -1289,6 +1290,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
struct query_info* lookup_qinfo = &qinfo;
struct query_info qinfo_tmp; /* placeholder for lookup_qinfo */
struct respip_client_info* cinfo = NULL, cinfo_tmp;
struct timeval wait_time;
memset(&qinfo, 0, sizeof(qinfo));
if((error != NETEVENT_NOERROR && error != NETEVENT_DONE)|| !repinfo) {
@ -1297,9 +1299,13 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
return 0;
}
if (worker->env.cfg->sock_queue_timeout && timeval_isset(c->recv_tv)) {
c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout;
if (timeval_smaller(c->recv_tv, worker->env.now_tv)) {
if (worker->env.cfg->sock_queue_timeout && timeval_isset(&c->recv_tv)) {
timeval_subtract(&wait_time, worker->env.now_tv, &c->recv_tv);
wait_queue_time = wait_time.tv_sec * 1000000 + wait_time.tv_usec;
if (worker->stats.max_query_time_us < wait_queue_time)
worker->stats.max_query_time_us = wait_queue_time;
c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout;
if (timeval_smaller(&c->recv_tv, worker->env.now_tv)) {
/* count and drop queries that were sitting in the socket queue too long */
worker->stats.num_queries_timed_out++;
return 0;

View File

@ -701,7 +701,8 @@ struct ub_server_stats {
long long num_queries_prefetch;
/** number of queries which are too late to process */
long long num_queries_timed_out;
/** the longest wait time in the queue */
long long max_query_time_us;
/**
* Sum of the querylistsize of the worker for
* every query that missed cache. To calculate average.

View File

@ -209,6 +209,7 @@ static void pr_stats(const char* nm, struct ub_stats_info* s)
PR_UL_NM("num.cachemiss", s->svr.num_queries_missed_cache);
PR_UL_NM("num.prefetch", s->svr.num_queries_prefetch);
PR_UL_NM("num.queries_timed_out", s->svr.num_queries_timed_out);
PR_UL_NM("query.queue_time_us.max", s->svr.max_query_time_us);
PR_UL_NM("num.expired", s->svr.ans_expired);
PR_UL_NM("num.recursivereplies", s->mesh_replies_sent);
#ifdef USE_DNSCRYPT