From 2442f7716b76052f4c055447d7dca060b7c478f0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 18 Oct 2018 15:44:10 -0400 Subject: [PATCH 01/70] cmake: move urcu library to main build Signed-off-by: Jeff Layton --- CMakeLists.txt | 3 +++ cmake/modules/FindLTTng.cmake | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb40a90e4b..d96ee70a44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,6 +180,8 @@ include_directories(BEFORE "${EXTRA_INCLUDE_DIR}" ) +find_library(LIBURCU urcu) + # Find misc system libs find_library(LIBRT rt) # extended Pthreads functions @@ -188,6 +190,7 @@ set(SYSTEM_LIBRARIES ${KRB5_LIBRARIES} ${SYSTEM_LIBRARIES} ${LIBRT} + ${LIBURCU} ) if (NOT FREEBSD) diff --git a/cmake/modules/FindLTTng.cmake b/cmake/modules/FindLTTng.cmake index bddecad4d2..6e850f8eec 100644 --- a/cmake/modules/FindLTTng.cmake +++ b/cmake/modules/FindLTTng.cmake @@ -40,10 +40,9 @@ find_path(LTTNG_LIBRARY_DIR DOC "The LTTng libraries") find_library(LTTNG_UST_LIBRARY lttng-ust PATHS ${LTTNG_LIBRARY_DIR}) -find_library(URCU_LIBRARY urcu-bp PATHS ${LTTNG_LIBRARY_DIR}) find_library(UUID_LIBRARY uuid) -set(LTTNG_LIBRARIES ${LTTNG_UST_LIBRARY} ${URCU_LIBRARY} ${UUID_LIBRARY}) +set(LTTNG_LIBRARIES ${LTTNG_UST_LIBRARY} ${UUID_LIBRARY}) find_path(LTTNG_CTL_INCLUDE_DIR NAMES lttng/lttng.h From ea57c9de876334b39049ea8ef71c24f9af0ceabe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 18 Oct 2018 15:57:03 -0400 Subject: [PATCH 02/70] pthreads: have all thread functions register themselves with urcu Signed-off-by: Jeff Layton --- src/rpc_rdma.c | 10 +++++++--- src/work_pool.c | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/rpc_rdma.c b/src/rpc_rdma.c index 9cd8203b22..6af608d369 100644 --- a/src/rpc_rdma.c +++ b/src/rpc_rdma.c @@ -51,6 +51,7 @@ #include //fcntl #include //fcntl #include +#include #define EPOLL_SIZE (10) /*^ expected number of fd, must be > 0 */ @@ -571,6 +572,7 @@ 
rpc_rdma_stats_thread(void *arg) int n; int rc; + rcu_register_thread(); while (rpc_rdma_state.run_count > 0) { n = epoll_wait(rpc_rdma_state.stats_epollfd, epoll_events, EPOLL_EVENTS, EPOLL_WAIT_MS); @@ -634,7 +636,7 @@ rpc_rdma_stats_thread(void *arg) rc = close(childfd); } } - + rcu_unregister_thread(); pthread_exit(NULL); } @@ -836,6 +838,7 @@ rpc_rdma_cq_thread(void *arg) int n; int rc; + rcu_register_thread(); while (rpc_rdma_state.run_count > 0) { n = epoll_wait(rpc_rdma_state.cq_epollfd, epoll_events, EPOLL_EVENTS, EPOLL_WAIT_MS); @@ -895,7 +898,7 @@ rpc_rdma_cq_thread(void *arg) mutex_unlock(&xprt->cm_lock); } } - + rcu_unregister_thread(); pthread_exit(NULL); } @@ -1091,6 +1094,7 @@ rpc_rdma_cm_thread(void *nullarg) int n; int rc; + rcu_register_thread(); while (rpc_rdma_state.run_count > 0) { n = epoll_wait(rpc_rdma_state.cm_epollfd, epoll_events, EPOLL_EVENTS, EPOLL_WAIT_MS); @@ -1166,7 +1170,7 @@ rpc_rdma_cm_thread(void *nullarg) SVC_DESTROY(&cm_xprt->sm_dr.xprt); } } - + rcu_unregister_thread(); pthread_exit(NULL); } diff --git a/src/work_pool.c b/src/work_pool.c index ea63602a1f..323637b406 100644 --- a/src/work_pool.c +++ b/src/work_pool.c @@ -55,6 +55,7 @@ #include #include #include +#include #include @@ -149,6 +150,8 @@ work_pool_thread(void *arg) int rc; bool spawn; + rcu_register_thread(); + pthread_cond_init(&wpt->pqcond, NULL); pthread_mutex_lock(&pool->pqh.qmutex); TAILQ_INSERT_TAIL(&pool->wptqh, wpt, wptq); @@ -238,6 +241,7 @@ work_pool_thread(void *arg) __func__, wpt->worker_name); cond_destroy(&wpt->pqcond); mem_free(wpt, sizeof(*wpt)); + rcu_unregister_thread(); return (NULL); } From ca74cde10ef02a322b8944a6c8639b1318fa34dc Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Fri, 19 Oct 2018 14:13:51 -0400 Subject: [PATCH 03/70] svc_xprt_lookup - Add extra ref on create An xprt has a ref for the hash table (that's released by SVC_DESTROY()); but when it's first created, only 1 ref was taken, so there wasn't a ref for the caller. 
Add an extra ref for the caller when the xprt is first created. Signed-off-by: Daniel Gryniewicz --- src/svc_xprt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/svc_xprt.c b/src/svc_xprt.c index 745b2164a7..d8d935aed9 100644 --- a/src/svc_xprt.c +++ b/src/svc_xprt.c @@ -174,6 +174,9 @@ svc_xprt_lookup(int fd, svc_xprt_setup_t setup) xprt->xp_fd = fd; xprt->xp_flags = SVC_XPRT_FLAG_INITIAL; + /* Get ref for caller */ + SVC_REF(xprt, SVC_REF_FLAG_NONE); + rec = REC_XPRT(xprt); rpc_dplx_rli(rec); if (opr_rbtree_insert(&t->t, &rec->fd_node)) { From bc284380700bdff7a038e19f43e7938405768898 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Mon, 29 Oct 2018 11:49:29 -0400 Subject: [PATCH 04/70] Fix addr size check in clnt_dg_control() The check to see if the given address will fit in the storage is backwards. Fix it, so that IPv4 can fit in the storage, and so that a huge address doesn't overflow. Signed-off-by: Daniel Gryniewicz --- src/clnt_dg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clnt_dg.c b/src/clnt_dg.c index 2610046bcc..6e024b6416 100644 --- a/src/clnt_dg.c +++ b/src/clnt_dg.c @@ -332,7 +332,7 @@ clnt_dg_control(CLIENT *clnt, u_int request, void *info) break; case CLSET_SVC_ADDR: /* set to new address */ addr = (struct netbuf *)info; - if (addr->len < sizeof(cu->cu_raddr)) { + if (addr->len > sizeof(cu->cu_raddr)) { rslt = false; break; From 56ade494dd64e18042f8c8d44c02b145a71ac4ba Mon Sep 17 00:00:00 2001 From: Jack Halford Date: Fri, 19 Oct 2018 17:37:00 +0200 Subject: [PATCH 05/70] Update timespec macros timespec[add|sub|cmp] are now public in FreeBSD 12, they now have an additional variable similar to NetBSD. We use the system macros by default, otherwise the updated version has been implemented. 
Signed-off-by: Jack Halford --- ntirpc/misc/timespec.h | 49 +++++++++++++++++++++++++----------------- src/clnt_generic.c | 2 +- src/svc_rqst.c | 2 +- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/ntirpc/misc/timespec.h b/ntirpc/misc/timespec.h index 574a41a68f..5b04a45941 100644 --- a/ntirpc/misc/timespec.h +++ b/ntirpc/misc/timespec.h @@ -40,19 +40,24 @@ /* Operations on timespecs */ #define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0) #define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) -#define timespeccmp(tvp, uvp, cmp) \ - (((tvp)->tv_sec == (uvp)->tv_sec) ? \ - ((tvp)->tv_nsec cmp(uvp)->tv_nsec) : \ - ((tvp)->tv_sec cmp(uvp)->tv_sec)) -#define timespecadd(vvp, uvp) \ - do { \ - (vvp)->tv_sec += (uvp)->tv_sec; \ - (vvp)->tv_nsec += (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec >= 1000000000) { \ - (vvp)->tv_sec++; \ - (vvp)->tv_nsec -= 1000000000; \ - } \ +#ifndef timespeccmp +#define timespeccmp(tvp, uvp, cmp) \ + (((tvp)->tv_sec == (uvp)->tv_sec) ? \ + ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ + ((tvp)->tv_sec cmp (uvp)->tv_sec)) +#endif /* timespeccmp */ +#ifndef timespecadd +#define timespecadd(tsp, usp, vsp) \ + do { \ + (vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec; \ + (vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec; \ + if ((vsp)->tv_nsec >= 1000000000L) { \ + (vsp)->tv_sec++; \ + (vsp)->tv_nsec -= 1000000000L; \ + } \ } while (0) +#endif /* timespecadd */ + #define timespec_adds(vvp, s) \ do { \ (vvp)->tv_sec += s; \ @@ -67,13 +72,17 @@ (vvp)->tv_nsec -= 1000000000; \ } \ } while (0) -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ + +#ifndef timespecsub +#define timespecsub(tsp, usp, vsp) \ + do { \ + (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ + (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ + if ((vsp)->tv_nsec < 0) { \ + (vsp)->tv_sec--; \ + 
(vsp)->tv_nsec += 1000000000L; \ + } \ } while (0) +#endif /* timespecsub */ + #endif /* TIMESPEC_H */ diff --git a/src/clnt_generic.c b/src/clnt_generic.c index aa5ca21902..ebde41434e 100644 --- a/src/clnt_generic.c +++ b/src/clnt_generic.c @@ -639,7 +639,7 @@ clnt_req_wait_reply(struct clnt_req *cc) } (void)clock_gettime(CLOCK_REALTIME_FAST, &ts); - timespecadd(&ts, &cc->cc_timeout); + timespecadd(&ts, &cc->cc_timeout, &ts); code = cond_timedwait(&cc->cc_we.cv, &cc->cc_we.mtx, &ts); __warnx(TIRPC_DEBUG_FLAG_CLNT_REQ, diff --git a/src/svc_rqst.c b/src/svc_rqst.c index ae12d7c459..f70098753c 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -222,7 +222,7 @@ svc_rqst_expire_ms(struct timespec *to) /* coarse nsec, not system time */ (void)clock_gettime(CLOCK_MONOTONIC_FAST, &ts); - timespecadd(&ts, to); + timespecadd(&ts, to, &ts); return timespec_ms(&ts); } From b696feab5e8ecb61d4230b92663bb62189d40cd7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 7 Dec 2018 08:14:07 -0500 Subject: [PATCH 06/70] Force urcu-bp for now When _LGPL_SOURCE is defined the lttng headers will inline some of the RCU functions with the "bulletproof" functions. This means that we can't currently select a urcu flavor at runtime that will be used universally. I'm a little unclear on what happens if you mix different flavored urcu calls, but I doubt that it's anything good. Let's switch back to using urcu-bp universally for now until this has a better solution. It's not ideal for performance, but our use of URCU is pretty minimal for now, and it's better to be safe than fast. 
Signed-off-by: Jeff Layton --- CMakeLists.txt | 2 +- src/rpc_rdma.c | 2 +- src/work_pool.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d96ee70a44..bba99680e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -180,7 +180,7 @@ include_directories(BEFORE "${EXTRA_INCLUDE_DIR}" ) -find_library(LIBURCU urcu) +find_library(LIBURCU urcu-bp) # Find misc system libs find_library(LIBRT rt) # extended Pthreads functions diff --git a/src/rpc_rdma.c b/src/rpc_rdma.c index 6af608d369..3e9f458eb3 100644 --- a/src/rpc_rdma.c +++ b/src/rpc_rdma.c @@ -51,7 +51,7 @@ #include //fcntl #include //fcntl #include -#include +#include #define EPOLL_SIZE (10) /*^ expected number of fd, must be > 0 */ diff --git a/src/work_pool.c b/src/work_pool.c index 323637b406..7a1b53967d 100644 --- a/src/work_pool.c +++ b/src/work_pool.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include From fe64a7b16f052cf7ad87380772de4944d7faadc0 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Wed, 19 Dec 2018 08:42:58 -0500 Subject: [PATCH 07/70] Fix XPRT leak for TCP connections XPRT refcounting could leak a ref in some cases. Fix this by dropping the extra ref in error cases, and by passing the event ref (which should have been dropped, because events are oneshot) to the event processor. This fixes the FD depletion while doing mount/unmount in a tight loop. 
Signed-off-by: Daniel Gryniewicz --- src/svc_rqst.c | 8 +++++--- src/svc_vc.c | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index f70098753c..720ef9a619 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -477,6 +477,7 @@ svc_rqst_rearm_events(SVCXPRT *xprt) if (sr_rec->ev_flags & SVC_RQST_FLAG_SHUTDOWN) return (0); + SVC_REF(xprt, SVC_REF_FLAG_NONE); rpc_dplx_rli(rec); /* assuming success */ @@ -507,6 +508,7 @@ svc_rqst_rearm_events(SVCXPRT *xprt) sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, sr_rec->ev_u.epoll.epoll_fd, sr_rec->sv[0], sr_rec->sv[1], code); + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); } else { __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | TIRPC_DEBUG_FLAG_REFCNT, @@ -862,10 +864,10 @@ svc_rqst_epoll_event(struct svc_rqst_rec *sr_rec, struct epoll_event *ev) } /* Another task may release transport in parallel. - * Take extra reference now to keep window as small as possible. - * Under normal circumstances, worker task (above) will release. + * We have a ref from being in epoll, but since epoll is one-shot, a new ref + * will be taken when we re-enter epoll. Use this ref for the processor + * without taking another one. */ - SVC_REF(&rec->xprt, SVC_REF_FLAG_NONE); /* MUST handle flags after reference. * Although another task may unhook, the error is non-fatal. 
diff --git a/src/svc_vc.c b/src/svc_vc.c index 6301cebf9b..4a762848c6 100644 --- a/src/svc_vc.c +++ b/src/svc_vc.c @@ -507,6 +507,8 @@ svc_vc_rendezvous(SVCXPRT *xprt) if (xprt->xp_dispatch.rendezvous_cb(newxprt) || svc_rqst_xprt_register(newxprt, xprt)) { SVC_DESTROY(newxprt); + /* Was never added to epoll */ + SVC_RELEASE(newxprt, SVC_RELEASE_FLAG_NONE); return (XPRT_DESTROYED); } return (XPRT_IDLE); From 2eda4a66a9ed62090514360ebef297a4396e31fa Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Wed, 19 Dec 2018 09:01:24 -0500 Subject: [PATCH 08/70] Make UDP XPRT refs match TCP TCP has a double-ref setup, where the tree has one, and the event loop has one. This allows processors to destroy the xprt on error without freeing it, ensuring no use-after-free. UDP does not have a XPRT tree, since it re-uses a single socket, so it only has one ref. This means that destroy-on-error immediately frees. Add an extra ref to the UDP path so that it matches TCP, and to allow destroy. Signed-off-by: Daniel Gryniewicz --- src/svc_dg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/svc_dg.c b/src/svc_dg.c index 43de3d6d8d..cc292236d0 100644 --- a/src/svc_dg.c +++ b/src/svc_dg.c @@ -94,6 +94,8 @@ svc_dg_xprt_zalloc(size_t iosz) /* Init SVCXPRT locks, etc */ rpc_dplx_rec_init(&su->su_dr); + /* Extra ref to match TCP */ + SVC_REF(&su->su_dr.xprt, SVC_REF_FLAG_NONE); xdr_ioq_setup(&su->su_dr.ioq); return (su); } @@ -318,6 +320,7 @@ svc_dg_recv(SVCXPRT *xprt) /* Only after checking SVC_XPRT_FLAG_DESTROYED: * because SVC_DESTROY() has decremented already. */ + SVC_DESTROY(xprt); SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); return (stat); } From b86186c0f7654580e615d2386d6eead87e9fa345 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 20 Dec 2018 09:16:06 -0500 Subject: [PATCH 09/70] Destroy XPRTs for local clients When destroying a client, the XPRT is not also destroyed, but just unref'd. 
This is fine, if multiple clients share a XPRT, but local clients all have unique FDs, and therefore unique XPRTs. This means that their XPRTs are never destroyed. Ganesha uses these a lot, leading to many leaked XPRTs and therefore FDs. Add a flag on a client that indicates it's local, and use that flag to desrtoy the XPRT when the client is destroyed. Signed-off-by: Daniel Gryniewicz --- ntirpc/rpc/clnt.h | 1 + src/clnt_vc.c | 5 +++++ src/rpcb_clnt.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/ntirpc/rpc/clnt.h b/ntirpc/rpc/clnt.h index 49eda22638..d442cbfd4a 100644 --- a/ntirpc/rpc/clnt.h +++ b/ntirpc/rpc/clnt.h @@ -182,6 +182,7 @@ struct rpc_timers { #define CLNT_FLAG_DESTROYING SVC_XPRT_FLAG_DESTROYING #define CLNT_FLAG_RELEASING SVC_XPRT_FLAG_RELEASING #define CLNT_FLAG_DESTROYED SVC_XPRT_FLAG_DESTROYED +#define CLNT_FLAG_LOCAL 0x8000 /* Client is unshared/local */ /* * CLNT_REF flags diff --git a/src/clnt_vc.c b/src/clnt_vc.c index 3033a74b8e..ff83c3fb5a 100644 --- a/src/clnt_vc.c +++ b/src/clnt_vc.c @@ -461,6 +461,11 @@ clnt_vc_destroy(CLIENT *clnt) if (cx->cx_rec) { SVC_RELEASE(&cx->cx_rec->xprt, SVC_RELEASE_FLAG_NONE); } + if (clnt->cl_flags & CLNT_FLAG_LOCAL) { + /* Local client; destroy the xprt */ + SVC_DESTROY(&cx->cx_rec->xprt); + } + clnt_vc_data_free(CT_DATA(cx)); } diff --git a/src/rpcb_clnt.c b/src/rpcb_clnt.c index 4bf69e7480..428b18c13e 100644 --- a/src/rpcb_clnt.c +++ b/src/rpcb_clnt.c @@ -1311,6 +1311,8 @@ static CLIENT *local_rpcb(const char *tag) CLNT_CREATE_FLAG_CONNECT); if (CLNT_SUCCESS(client)) { + /* This is a local client (we created the fd above) */ + client->cl_flags |= CLNT_FLAG_LOCAL; return client; } t = rpc_sperror(&client->cl_error, tag); From d16697b4d1da8a9216f5a955005b7cd1ac28a913 Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Wed, 17 Oct 2018 16:12:16 -0700 Subject: [PATCH 10/70] Simplify epoll task processing Signed-off-by: Frank S. 
Filz --- src/svc_rqst.c | 100 ++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 43 deletions(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index 720ef9a619..49fa8a6a09 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -195,6 +195,8 @@ svc_rqst_lookup_chan(uint32_t chan_id) /* forward declaration in lieu of moving code {WAS} */ static void svc_rqst_run_task(struct work_pool_entry *); +static void svc_rqst_epoll_loop(struct work_pool_entry *wpe); +static void svc_complete_task(struct svc_rqst_rec *sr_rec, bool finished); static int svc_rqst_expire_cmpf(const struct opr_rbtree_node *lhs, @@ -287,6 +289,7 @@ svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) struct svc_rqst_rec *sr_rec; uint32_t n_id; int code = 0; + work_pool_fun_t fun = svc_rqst_run_task; mutex_lock(&svc_rqst_set.mtx); if (!svc_rqst_set.next_id) { @@ -326,6 +329,7 @@ svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) #if defined(TIRPC_EPOLL) if (flags & SVC_RQST_FLAG_EPOLL) { sr_rec->ev_type = SVC_EVENT_EPOLL; + fun = svc_rqst_epoll_loop; /* XXX improve this too */ sr_rec->ev_u.epoll.max_events = @@ -381,7 +385,7 @@ svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) if (!code) { atomic_inc_int32_t(&sr_rec->ev_refcnt); - sr_rec->ev_wpe.fun = svc_rqst_run_task; + sr_rec->ev_wpe.fun = fun; sr_rec->ev_wpe.arg = u_data; work_pool_submit(&svc_work_pool, &sr_rec->ev_wpe); } @@ -904,7 +908,7 @@ svc_rqst_epoll_event(struct svc_rqst_rec *sr_rec, struct epoll_event *ev) /* * not locked */ -static inline bool +static inline struct rpc_dplx_rec * svc_rqst_epoll_events(struct svc_rqst_rec *sr_rec, int n_events) { struct rpc_dplx_rec *rec = NULL; @@ -919,7 +923,7 @@ svc_rqst_epoll_events(struct svc_rqst_rec *sr_rec, int n_events) if (!rec) { /* continue waiting for events with this task */ - return false; + return NULL; } while (ix < n_events) { @@ -936,28 +940,20 @@ svc_rqst_epoll_events(struct 
svc_rqst_rec *sr_rec, int n_events) atomic_inc_int32_t(&sr_rec->ev_refcnt); work_pool_submit(&svc_work_pool, &sr_rec->ev_wpe); - /* in most cases have only one event, use this hot thread */ - rec->ioq.ioq_wpe.fun = svc_rqst_xprt_task; - svc_rqst_xprt_task(&(rec->ioq.ioq_wpe)); - - /* failsafe idle processing after work task */ - if (atomic_postclear_uint32_t_bits(&wakeups, ~SVC_RQST_WAKEUPS) - > SVC_RQST_WAKEUPS) { - svc_rqst_clean_idle(__svc_params->idle_timeout); - } - - return true; + return rec; } -static inline bool -svc_rqst_epoll_loop(struct svc_rqst_rec *sr_rec) +static void svc_rqst_epoll_loop(struct work_pool_entry *wpe) { + struct svc_rqst_rec *sr_rec = + opr_containerof(wpe, struct svc_rqst_rec, ev_wpe); struct clnt_req *cc; struct opr_rbtree_node *n; struct timespec ts; int timeout_ms; int expire_ms; int n_events; + bool finished; for (;;) { timeout_ms = SVC_RQST_TIMEOUT_MS; @@ -1006,13 +1002,30 @@ svc_rqst_epoll_loop(struct svc_rqst_rec *sr_rec) __func__, sr_rec->ev_u.epoll.epoll_fd, n_events); - return true; + finished = true; + break; } if (n_events > 0) { atomic_add_uint32_t(&wakeups, n_events); - - if (svc_rqst_epoll_events(sr_rec, n_events)) - return false; + struct rpc_dplx_rec *rec; + + rec = svc_rqst_epoll_events(sr_rec, n_events); + + if (rec != NULL) { + /* use this hot thread for the first event */ + rec->ioq.ioq_wpe.fun = svc_rqst_xprt_task; + svc_rqst_xprt_task(&(rec->ioq.ioq_wpe)); + + /* failsafe idle processing after work task */ + if (atomic_postclear_uint32_t_bits( + &wakeups, ~SVC_RQST_WAKEUPS) + > SVC_RQST_WAKEUPS) { + svc_rqst_clean_idle( + __svc_params->idle_timeout); + } + finished = false; + break; + } continue; } if (!n_events) { @@ -1027,12 +1040,34 @@ svc_rqst_epoll_loop(struct svc_rqst_rec *sr_rec) __func__, sr_rec->ev_u.epoll.epoll_fd, n_events); - return true; + finished = true; + break; } } + if (finished) { + close(sr_rec->ev_u.epoll.epoll_fd); + mem_free(sr_rec->ev_u.epoll.events, + sr_rec->ev_u.epoll.max_events * + 
sizeof(struct epoll_event)); + } + + svc_complete_task(sr_rec, finished); } #endif +static void svc_complete_task(struct svc_rqst_rec *sr_rec, bool finished) +{ + if (finished) { + /* reference count here should be 2: + * 1 svc_rqst_set + * +1 this work_pool thread + * so, DROP one here so the final release will go to 0. + */ + atomic_dec_int32_t(&sr_rec->ev_refcnt); /* svc_rqst_set */ + } + svc_rqst_release(sr_rec); +} + /* * No locking, "there can be only one" */ @@ -1041,23 +1076,10 @@ svc_rqst_run_task(struct work_pool_entry *wpe) { struct svc_rqst_rec *sr_rec = opr_containerof(wpe, struct svc_rqst_rec, ev_wpe); - bool finished; /* enter event loop */ switch (sr_rec->ev_type) { -#if defined(TIRPC_EPOLL) - case SVC_EVENT_EPOLL: - finished = svc_rqst_epoll_loop(sr_rec); - if (finished) { - close(sr_rec->ev_u.epoll.epoll_fd); - mem_free(sr_rec->ev_u.epoll.events, - sr_rec->ev_u.epoll.max_events * - sizeof(struct epoll_event)); - } - break; -#endif default: - finished = true; /* XXX formerly select/fd_set case, now placeholder for new * event systems, reworked select, etc. */ __warnx(TIRPC_DEBUG_FLAG_ERROR, @@ -1066,15 +1088,7 @@ svc_rqst_run_task(struct work_pool_entry *wpe) break; } /* switch */ - if (finished) { - /* reference count here should be 2: - * 1 svc_rqst_set - * +1 this work_pool thread - * so, DROP one here so the final release will go to 0. - */ - atomic_dec_int32_t(&sr_rec->ev_refcnt); /* svc_rqst_set */ - } - svc_rqst_release(sr_rec); + svc_complete_task(sr_rec, true); } int From 3c70a04da38e1194f19f35faaf5ecae66b91c319 Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Thu, 18 Oct 2018 10:35:47 -0700 Subject: [PATCH 11/70] Flatten call stack a bit - replace request_cb with alloc_cb and free_cb Instead of calling request_cb which alllocates an svc_req and then calls SVC_DECODE and then frees the request, we call alloc_cb, call SVC_DECODE ourselves, and then call free_cb. 
This makes it a little easier to handle suspending an async request in the future. Signed-off-by: Frank S. Filz --- ntirpc/rpc/svc.h | 6 ++++-- src/svc.c | 3 ++- src/svc_dg.c | 2 +- src/svc_internal.h | 5 ++++- src/svc_raw.c | 2 +- src/svc_rqst.c | 20 ++++++++++++++++++++ src/svc_vc.c | 2 +- src/xdr_rdma.c | 2 +- tests/rpcping.c | 20 +++++++++----------- 9 files changed, 43 insertions(+), 19 deletions(-) diff --git a/ntirpc/rpc/svc.h b/ntirpc/rpc/svc.h index 27fa29416a..8f95298243 100644 --- a/ntirpc/rpc/svc.h +++ b/ntirpc/rpc/svc.h @@ -118,11 +118,13 @@ enum xprt_stat { #define RPC_SVC_FDSET_SET 5 typedef enum xprt_stat (*svc_xprt_fun_t) (SVCXPRT *); -typedef enum xprt_stat (*svc_xprt_xdr_fun_t) (SVCXPRT *, XDR *); +typedef struct svc_req *(*svc_xprt_alloc_fun_t) (SVCXPRT *, XDR *); +typedef void (*svc_xprt_free_fun_t) (struct svc_req *, enum xprt_stat); typedef struct svc_init_params { svc_xprt_fun_t disconnect_cb; - svc_xprt_xdr_fun_t request_cb; + svc_xprt_alloc_fun_t alloc_cb; + svc_xprt_free_fun_t free_cb; u_long flags; u_int max_connections; /* xprts */ diff --git a/src/svc.c b/src/svc.c index c75c048a1d..6932033ece 100644 --- a/src/svc.c +++ b/src/svc.c @@ -139,7 +139,8 @@ svc_init(svc_init_params *params) return true; } __svc_params->disconnect_cb = params->disconnect_cb; - __svc_params->request_cb = params->request_cb; + __svc_params->alloc_cb = params->alloc_cb; + __svc_params->free_cb = params->free_cb; __svc_params->max_connections = (params->max_connections) ? 
params->max_connections : FD_SETSIZE; diff --git a/src/svc_dg.c b/src/svc_dg.c index cc292236d0..7e13d57f6b 100644 --- a/src/svc_dg.c +++ b/src/svc_dg.c @@ -312,7 +312,7 @@ svc_dg_recv(SVCXPRT *xprt) /* pass the xdrs to user to store in struct svc_req, as most of * the work has already been done on rendezvous */ - stat = __svc_params->request_cb(xprt, REC_XPRT(xprt)->ioq.xdrs); + stat = svc_request(xprt, REC_XPRT(xprt)->ioq.xdrs); if (xprt->xp_flags & SVC_XPRT_FLAG_DESTROYED) return (XPRT_DESTROYED); diff --git a/src/svc_internal.h b/src/svc_internal.h index 56cb60af32..2d9b63e947 100644 --- a/src/svc_internal.h +++ b/src/svc_internal.h @@ -56,7 +56,8 @@ struct svc_params { } ev_u; svc_xprt_fun_t disconnect_cb; - svc_xprt_xdr_fun_t request_cb; + svc_xprt_alloc_fun_t alloc_cb; + svc_xprt_free_fun_t free_cb; struct { int ctx_hash_partitions; @@ -76,6 +77,8 @@ struct svc_params { int32_t idle_timeout; }; +enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs); + extern struct svc_params __svc_params[1]; /* diff --git a/src/svc_raw.c b/src/svc_raw.c index 420013e4f0..8519ba5e99 100644 --- a/src/svc_raw.c +++ b/src/svc_raw.c @@ -141,7 +141,7 @@ svc_raw_recv(SVCXPRT *xprt) } mutex_unlock(&svcraw_lock); - return (__svc_params->request_cb(xprt, srp->raw_dr.ioq.xdrs)); + return svc_request(xprt, srp->raw_dr.ioq.xdrs); } static enum xprt_stat diff --git a/src/svc_rqst.c b/src/svc_rqst.c index 49fa8a6a09..79657d27fe 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -49,6 +49,7 @@ #include "clnt_internal.h" #include "svc_internal.h" #include "svc_xprt.h" +#include /** * @file svc_rqst.c @@ -777,6 +778,25 @@ svc_rqst_xprt_task(struct work_pool_entry *wpe) SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); } +enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs) +{ + static enum xprt_stat stat; + struct svc_req *req = __svc_params->alloc_cb(xprt, xdrs); + + stat = SVC_DECODE(req); + + if (req->rq_auth) + SVCAUTH_RELEASE(req); + + XDR_DESTROY(req->rq_xdrs); + + 
__svc_params->free_cb(req, stat); + + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); + + return stat; +} + /* * Like __svc_clean_idle but event-type independent. For now no cleanfds. */ diff --git a/src/svc_vc.c b/src/svc_vc.c index 4a762848c6..dda96cd194 100644 --- a/src/svc_vc.c +++ b/src/svc_vc.c @@ -797,7 +797,7 @@ svc_vc_recv(SVCXPRT *xprt) return SVC_STAT(xprt); } - return (__svc_params->request_cb(xprt, xioq->xdrs)); + return svc_request(xprt, xioq->xdrs); } static enum xprt_stat diff --git a/src/xdr_rdma.c b/src/xdr_rdma.c index 0e1394f888..e7f4dfe549 100644 --- a/src/xdr_rdma.c +++ b/src/xdr_rdma.c @@ -338,7 +338,7 @@ xdr_rdma_wrap_callback(struct rpc_rdma_cbc *cbc, RDMAXPRT *xprt) { XDR *xdrs = cbc->holdq.xdrs; - return (int)__svc_params->request_cb(&xprt->sm_dr.xprt, xdrs); + return (int)svc_request(&xprt->sm_dr.xprt, xdrs); } /***********************************/ diff --git a/tests/rpcping.c b/tests/rpcping.c index 61974e4532..782bd4da26 100644 --- a/tests/rpcping.c +++ b/tests/rpcping.c @@ -194,26 +194,23 @@ worker(void *arg) return NULL; } -static enum xprt_stat -decode_request(SVCXPRT *xprt, XDR *xdrs) +static struct svc_req * +alloc_request(SVCXPRT *xprt, XDR *xdrs) { struct svc_req *req = calloc(1, sizeof(*req)); - enum xprt_stat stat; SVC_REF(xprt, SVC_REF_FLAG_NONE); req->rq_xprt = xprt; req->rq_xdrs = xdrs; req->rq_refcnt = 1; - stat = SVC_DECODE(req); - - if (req->rq_auth) - SVCAUTH_RELEASE(req); + return req; +} - XDR_DESTROY(req->rq_xdrs); - SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); +static void +free_request(struct svc_req *req, enum xprt_stat stat) +{ free(req); - return stat; } static void usage() @@ -315,7 +312,8 @@ int main(int argc, char *argv[]) } memset(&svc_params, 0, sizeof(svc_params)); - svc_params.request_cb = decode_request; + svc_params.alloc_cb = alloc_request; + svc_params.free_cb = free_request; svc_params.flags = SVC_INIT_EPOLL | SVC_INIT_NOREG_XPRTS; svc_params.max_events = 512; svc_params.ioq_thrd_max = nworkers; From 
80693104737c8ed8473371a34a71d168f2ced912 Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Mon, 22 Oct 2018 07:16:36 -0700 Subject: [PATCH 12/70] Add ability to suspend requests If the protocol layer needs to save some data (it probably does), it should use SVCXPRT->xp_u1 or SVCXPRT->xp_u2 (Ganesha only uses xp_u2 so it could use xp_u1). If a request is to be suspended, the SVCXPRT and svc_req should not be used once it is possible for another thread to be processing an async completion of the request. This is because if such an async completion happened to run before the original thread returned up the stack, the request could actually be requeued and even executing on another RPC thread while the first thread is still unwinding. Signed-off-by: Frank S. Filz --- ntirpc/rpc/svc.h | 5 ++++ src/libntirpc.map.in.cmake | 1 + src/rpc_dplx_internal.h | 1 + src/svc_rqst.c | 54 +++++++++++++++++++++++++++++++++++++- 4 files changed, 60 insertions(+), 1 deletion(-) diff --git a/ntirpc/rpc/svc.h b/ntirpc/rpc/svc.h index 8f95298243..169fe6b0f0 100644 --- a/ntirpc/rpc/svc.h +++ b/ntirpc/rpc/svc.h @@ -60,6 +60,7 @@ typedef struct svc_xprt SVCXPRT; enum xprt_stat { XPRT_IDLE = 0, XPRT_MOREREQS, + XPRT_SUSPEND, /* always last in this order for comparisons */ XPRT_DIED, XPRT_DESTROYED @@ -237,6 +238,8 @@ struct svc_xprt { svc_req_fun_t process_cb; svc_xprt_fun_t rendezvous_cb; } xp_dispatch; + /* Handle resumed requests */ + svc_req_fun_t xp_resume_cb; SVCXPRT *xp_parent; char *xp_tp; /* transport provider device name */ @@ -330,6 +333,8 @@ struct svc_req { #define svc_getrpccaller(x) (&(x)->xp_remote.ss) #define svc_getrpclocal(x) (&(x)->xp_local.ss) +extern void svc_resume(struct svc_req *req); + /* * Ganesha. Get connected transport type. 
*/ diff --git a/src/libntirpc.map.in.cmake b/src/libntirpc.map.in.cmake index 2e465df66d..7cec34f02d 100644 --- a/src/libntirpc.map.in.cmake +++ b/src/libntirpc.map.in.cmake @@ -135,6 +135,7 @@ NTIRPC_${NTIRPC_VERSION} { svc_ncreate; svc_raw_ncreate; svc_reg; + svc_resume; svc_rqst_new_evchan; svc_rqst_evchan_reg; svc_rqst_evchan_unreg; diff --git a/src/rpc_dplx_internal.h b/src/rpc_dplx_internal.h index 5c6a4dc346..2bb89cd8de 100644 --- a/src/rpc_dplx_internal.h +++ b/src/rpc_dplx_internal.h @@ -76,6 +76,7 @@ struct rpc_dplx_rec { u_int sendsz; uint32_t call_xid; /**< current call xid */ uint32_t ev_count; /**< atomic count of waiting events */ + struct svc_req *svc_req; /**< svc_req we are processing */ }; #define REC_XPRT(p) (opr_containerof((p), struct rpc_dplx_rec, xprt)) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index 79657d27fe..a31fcf61e3 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -780,11 +780,26 @@ svc_rqst_xprt_task(struct work_pool_entry *wpe) enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs) { - static enum xprt_stat stat; + enum xprt_stat stat; struct svc_req *req = __svc_params->alloc_cb(xprt, xdrs); + struct rpc_dplx_rec *rpc_dplx_rec = REC_XPRT(xprt); + /* Track the request we are processing */ + rpc_dplx_rec->svc_req = req; + + /* All decode functions basically do a + * return xprt->xp_dispatch.process_cb(req); + */ stat = SVC_DECODE(req); + if (stat == XPRT_SUSPEND) { + /* The rquest is suspended, don't touch the request in any way + * because the resume may already be scheduled and running on + * another thread. 
+ */ + return XPRT_SUSPEND; + } + if (req->rq_auth) SVCAUTH_RELEASE(req); @@ -797,6 +812,43 @@ enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs) return stat; } +static void svc_resume_task(struct work_pool_entry *wpe) +{ + struct rpc_dplx_rec *rec = + opr_containerof(wpe, struct rpc_dplx_rec, ioq.ioq_wpe); + struct svc_req *req = rec->svc_req; + SVCXPRT *xprt = &rec->xprt; + enum xprt_stat stat; + + /* Resume the request. */ + stat = req->rq_xprt->xp_resume_cb(req); + + if (stat == XPRT_SUSPEND) { + /* The rquest is suspended, don't touch the request in any way + * because the resume may already be scheduled and running on + * another thread. + */ + return; + } + + if (req->rq_auth) + SVCAUTH_RELEASE(req); + + XDR_DESTROY(req->rq_xdrs); + + __svc_params->free_cb(req, stat); + + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); +} + +void svc_resume(struct svc_req *req) +{ + struct rpc_dplx_rec *rpc_dplx_rec = REC_XPRT(req->rq_xprt); + + rpc_dplx_rec->ioq.ioq_wpe.fun = svc_resume_task; + work_pool_submit(&svc_work_pool, &(rpc_dplx_rec->ioq.ioq_wpe)); +} + /* * Like __svc_clean_idle but event-type independent. For now no cleanfds. 
*/ From 0104cd166c97a4693b452d47811ae255b73b3d45 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Thu, 24 Jan 2019 09:43:24 +0530 Subject: [PATCH 13/70] Convert unsafe strncpy to strlcpy Fix security warnings from AppScan tool --- src/clnt_perror.c | 2 +- src/netnamer.c | 2 +- src/rpc_generic.c | 2 +- src/rpcb_clnt.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/clnt_perror.c b/src/clnt_perror.c index 545efe8f90..8c13e758d3 100644 --- a/src/clnt_perror.c +++ b/src/clnt_perror.c @@ -72,7 +72,7 @@ rpc_sperror(const struct rpc_err *e, const char *s) len -= i; } - (void)strncpy(str, clnt_sperrno(e->re_status), len - 1); + (void)strlcpy(str, clnt_sperrno(e->re_status), len); i = strlen(str); str += i; len -= i; diff --git a/src/netnamer.c b/src/netnamer.c index 0b14b33647..7d3f06013e 100644 --- a/src/netnamer.c +++ b/src/netnamer.c @@ -190,7 +190,7 @@ int netname2host(char netname[MAXNETNAMELEN + 1], char *hostname, int hostlen) if (getnetid(netname, valbuf)) { val = valbuf; if ((*val == '0') && (val[1] == ':')) { - (void)strncpy(hostname, val + 2, hostlen); + (void)strlcpy(hostname, val + 2, hostlen); return (1); } } diff --git a/src/rpc_generic.c b/src/rpc_generic.c index bf69aa7b89..5373117afa 100644 --- a/src/rpc_generic.c +++ b/src/rpc_generic.c @@ -957,7 +957,7 @@ __rpc_uaddr2taddr_af(int af, const char *uaddr) sun = (struct sockaddr_un *)mem_zalloc(sizeof(*sun)); sun->sun_family = AF_LOCAL; - strncpy(sun->sun_path, addrstr, sizeof(sun->sun_path) - 1); + strlcpy(sun->sun_path, addrstr, sizeof(sun->sun_path)); ret->len = SUN_LEN(sun); ret->maxlen = sizeof(struct sockaddr_un); ret->buf = sun; diff --git a/src/rpcb_clnt.c b/src/rpcb_clnt.c index 428b18c13e..fc5fdb2bba 100644 --- a/src/rpcb_clnt.c +++ b/src/rpcb_clnt.c @@ -338,7 +338,7 @@ static CLIENT *getclnthandle(const char *host, const struct netconfig *nconf, if (targaddr) { *targaddr = mem_zalloc(sizeof(sun.sun_path)); - strncpy(*targaddr, _PATH_RPCBINDSOCK, + strlcpy(*targaddr, 
_PATH_RPCBINDSOCK, sizeof(sun.sun_path)); } return (client); @@ -598,7 +598,7 @@ __rpcbind_is_up(void) if (sock < 0) return (false); sun.sun_family = AF_LOCAL; - strncpy(sun.sun_path, _PATH_RPCBINDSOCK, sizeof(sun.sun_path)); + strlcpy(sun.sun_path, _PATH_RPCBINDSOCK, sizeof(sun.sun_path)); if (connect(sock, (struct sockaddr *)&sun, sizeof(sun)) < 0) { close(sock); From ec03704495097f65dbfef2f69e362b6da1eae148 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 31 Jan 2019 14:39:58 -0500 Subject: [PATCH 14/70] Make a header for strl*() Signed-off-by: Daniel Gryniewicz --- src/clnt_perror.c | 2 ++ src/netnamer.c | 2 ++ src/rpc_dplx_internal.h | 8 -------- src/rpc_generic.c | 1 + src/rpcb_clnt.c | 1 + src/strl.h | 40 ++++++++++++++++++++++++++++++++++++++++ src/strlcpy.c | 3 ++- 7 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 src/strl.h diff --git a/src/clnt_perror.c b/src/clnt_perror.c index 8c13e758d3..188e9ced3a 100644 --- a/src/clnt_perror.c +++ b/src/clnt_perror.c @@ -46,6 +46,8 @@ #include #include +#include "strl.h" + static char *auth_errmsg(enum auth_stat); /* diff --git a/src/netnamer.c b/src/netnamer.c index 7d3f06013e..6e70883d36 100644 --- a/src/netnamer.c +++ b/src/netnamer.c @@ -48,6 +48,8 @@ #include #include +#include "strl.h" + static char *OPSYS = "unix"; static char *NETID = "netid.byname"; static char *NETIDFILE = "/etc/netid"; diff --git a/src/rpc_dplx_internal.h b/src/rpc_dplx_internal.h index 2bb89cd8de..626c9e70b1 100644 --- a/src/rpc_dplx_internal.h +++ b/src/rpc_dplx_internal.h @@ -84,14 +84,6 @@ struct rpc_dplx_rec { #define RPC_DPLX_LOCKED 0x00100000 #define RPC_DPLX_UNLOCK 0x00200000 -#ifndef HAVE_STRLCAT -extern size_t strlcat(char *, const char *, size_t); -#endif - -#ifndef HAVE_STRLCPY -extern size_t strlcpy(char *, const char *src, size_t); -#endif - /* in clnt_generic.c */ enum xprt_stat clnt_req_process_reply(SVCXPRT *, struct svc_req *); int clnt_req_xid_cmpf(const struct opr_rbtree_node *lhs, diff 
--git a/src/rpc_generic.c b/src/rpc_generic.c index 5373117afa..6d7e94ef3d 100644 --- a/src/rpc_generic.c +++ b/src/rpc_generic.c @@ -63,6 +63,7 @@ #include #include "rpc_com.h" +#include "strl.h" void thr_keyfree(void *k) diff --git a/src/rpcb_clnt.c b/src/rpcb_clnt.c index fc5fdb2bba..aee8c704a5 100644 --- a/src/rpcb_clnt.c +++ b/src/rpcb_clnt.c @@ -58,6 +58,7 @@ #include #include "rpc_com.h" +#include "strl.h" /* retry timeout default to the moon and back */ static struct timespec to = { 3, 0 }; diff --git a/src/strl.h b/src/strl.h new file mode 100644 index 0000000000..5d2f3b4455 --- /dev/null +++ b/src/strl.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2019 Red Hat, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef STRL_H +#define STRL_H + +#include "config.h" + +#ifndef HAVE_STRLCAT +extern size_t strlcat(char *, const char *, size_t); +#endif + +#ifndef HAVE_STRLCPY +extern size_t strlcpy(char *, const char *src, size_t); +#endif + + +#endif /* STRL_H */ diff --git a/src/strlcpy.c b/src/strlcpy.c index 029a78d92b..2038a086d6 100644 --- a/src/strlcpy.c +++ b/src/strlcpy.c @@ -14,10 +14,11 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#ifndef HAVE_STRLCPY #include "config.h" +#ifndef HAVE_STRLCPY #include +#include "strl.h" /* * Copy src to string dst of size siz. At most siz-1 characters From f8b5fa57a1d675a99a4e2209839348843e553cb6 Mon Sep 17 00:00:00 2001 From: Madhu Thorat Date: Thu, 7 Mar 2019 13:49:37 -0500 Subject: [PATCH 15/70] Free 'ret' when returning NULL in __rpc_taddr2uaddr_af() Coverity Scan fix: In __rpc_taddr2uaddr_af() free the allocated memory for variable 'ret' when returning NULL from the function. Signed-off-by: Madhu Thorat --- src/rpc_generic.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/rpc_generic.c b/src/rpc_generic.c index 6d7e94ef3d..0b237f3308 100644 --- a/src/rpc_generic.c +++ b/src/rpc_generic.c @@ -788,12 +788,16 @@ __rpc_taddr2uaddr_af(int af, const struct netbuf *nbuf) switch (af) { case AF_INET: - if (nbuf->len < sizeof(*sin)) + if (nbuf->len < sizeof(*sin)) { + mem_free(ret, RETURN_SIZE); return NULL; + } sin = nbuf->buf; if (inet_ntop(af, &sin->sin_addr, namebuf, sizeof(namebuf)) - == NULL) + == NULL) { + mem_free(ret, RETURN_SIZE); return NULL; + } port = ntohs(sin->sin_port); if (sprintf (ret, "%s.%u.%u", namebuf, ((u_int32_t) port) >> 8, @@ -804,8 +808,10 @@ __rpc_taddr2uaddr_af(int af, const struct netbuf *nbuf) break; #ifdef INET6 case AF_INET6: - if (nbuf->len < sizeof(*sin6)) + if (nbuf->len < sizeof(*sin6)) { + mem_free(ret, RETURN_SIZE); return NULL; + } sin6 = nbuf->buf; if (inet_ntop(af, &sin6->sin6_addr, namebuf6, sizeof(namebuf6)) == NULL) { From 
4d6067c1064b296043631d58aba286556442f060 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Mon, 1 Apr 2019 13:27:25 -0400 Subject: [PATCH 16/70] NSM - Don't force UDP portmapper lookups On modern Linux, statd only listens on TCP. Forcing UDP for connections causes them to time out. Any system that claims to support TCP but doesn't is so out-of-date we probably have other issues there. Fix it so that TCP actually uses TCP. Signed-off-by: Daniel Gryniewicz --- src/rpcb_clnt.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/rpcb_clnt.c b/src/rpcb_clnt.c index aee8c704a5..9d613eacfe 100644 --- a/src/rpcb_clnt.c +++ b/src/rpcb_clnt.c @@ -679,21 +679,8 @@ __rpcb_findaddr_timed(rpcprog_t program, rpcvers_t version, rpcvers_t pmapvers = 2; uint16_t port = 0; - /* - * Try UDP only - there are some portmappers out - * there that use UDP only. - */ if (strcmp(nconf->nc_proto, NC_TCP) == 0) { - struct netconfig *newnconf; - - newnconf = getnetconfigent("udp"); - if (!newnconf) { - client = clnt_raw_ncreate(program, version); - client->cl_error.re_status = RPC_UNKNOWNPROTO; - goto error; - } - client = getclnthandle(host, newnconf, &parms.r_addr); - freenetconfigent(newnconf); + client = getclnthandle(host, nconf, &parms.r_addr); } else if (strcmp(nconf->nc_proto, NC_UDP) == 0) client = getclnthandle(host, nconf, &parms.r_addr); else From 1bfa57230d1cdc0d5639078ad37df5f46dbb4725 Mon Sep 17 00:00:00 2001 From: Gaurav Gangalwar Date: Tue, 23 Apr 2019 13:20:38 -0400 Subject: [PATCH 17/70] Close fd if makefd_xprt fails. We are doing accept and returning XPRT_DIED without closing fd, which will be unmonitored and cause fd leak. Client will see connection succeded but IO will hang on it. 
--- src/svc_vc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/svc_vc.c b/src/svc_vc.c index dda96cd194..2d8fbd29cc 100644 --- a/src/svc_vc.c +++ b/src/svc_vc.c @@ -453,8 +453,10 @@ svc_vc_rendezvous(SVCXPRT *xprt) */ newxprt = makefd_xprt(fd, req_xd->sx_dr.sendsz, req_xd->sx_dr.recvsz, &si, SVC_XPRT_FLAG_CLOSE); - if ((!newxprt) || (!(newxprt->xp_flags & SVC_XPRT_FLAG_INITIAL))) + if ((!newxprt) || (!(newxprt->xp_flags & SVC_XPRT_FLAG_INITIAL))) { + close(fd); return (XPRT_DIED); + } svc_vc_override_ops(newxprt, xprt); From 6bce5340d858673b00cc46a17c2b9a188dbcd902 Mon Sep 17 00:00:00 2001 From: Sachin Punadikar Date: Wed, 24 Apr 2019 12:44:27 +0530 Subject: [PATCH 18/70] Use "gss_get_mic" instead of "gss_sign" "gss_sign" is a deprecated GSS-API function. This needs to be replaced by "gss_get_mic". Signed-off-by: Sachin Punadikar --- src/svc_auth_gss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index 48437c9b29..15bc73e3b0 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -280,8 +280,8 @@ svcauth_gss_accept_sec_context(struct svc_req *req, gss_release_buffer(&min_stat, &gd->checksum); maj_stat = - gss_sign(&min_stat, gd->ctx, GSS_C_QOP_DEFAULT, &seqbuf, - &checksum); + gss_get_mic(&min_stat, gd->ctx, GSS_C_QOP_DEFAULT, &seqbuf, + &checksum); if (maj_stat != GSS_S_COMPLETE) { gss_release_buffer(&min_stat, &gr->gr_token); From 31b3eb150e7a7a0f09134c7953efecbe8c8aafda Mon Sep 17 00:00:00 2001 From: Sachin Punadikar Date: Wed, 24 Apr 2019 19:34:40 +0530 Subject: [PATCH 19/70] Fix memory leak for RPCSEC_GSS The GSS routines expects to release various fields been used by the applicationi, after their use. Fixed the code to release such resources. 
Signed-off-by: Sachin Punadikar --- src/svc_auth_gss.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index 15bc73e3b0..6a57cd660c 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -291,12 +291,14 @@ svcauth_gss_accept_sec_context(struct svc_req *req, /* XXX ref? (assert gd->locked?) */ if (checksum.length > MAX_AUTH_BYTES){ gss_release_buffer(&min_stat, &gr->gr_token); + gss_release_buffer(&min_stat, &checksum); return (false); } req->rq_msg.RPCM_ack.ar_verf.oa_flavor = RPCSEC_GSS; req->rq_msg.RPCM_ack.ar_verf.oa_length = checksum.length; memcpy(req->rq_msg.RPCM_ack.ar_verf.oa_body, checksum.value, checksum.length); + gss_release_buffer(&min_stat, &checksum); } return (true); } @@ -371,12 +373,14 @@ svcauth_gss_nextverf(struct svc_req *req, struct svc_rpc_gss_data *gd, } if (checksum.length > MAX_AUTH_BYTES) { gss_log_status("checksum.length", maj_stat, min_stat); + gss_release_buffer(&min_stat, &checksum); return (false); } req->rq_msg.RPCM_ack.ar_verf.oa_flavor = RPCSEC_GSS; req->rq_msg.RPCM_ack.ar_verf.oa_length = checksum.length; memcpy(req->rq_msg.RPCM_ack.ar_verf.oa_body, checksum.value, checksum.length); + gss_release_buffer(&min_stat, &checksum); return (true); } From 95155fa201421315e629788b719580357053037f Mon Sep 17 00:00:00 2001 From: Madhu Thorat Date: Fri, 26 Apr 2019 10:22:50 -0400 Subject: [PATCH 20/70] getclnthandle(): Fix assert(client == NULL) and destroy client handle. In getclnthandle() function 'clnt_tli_ncreate()' is called. Currently even if 'clnt_tli_ncreate()' doesn't complete successfully it returns a valid pointer for 'client' and the status is saved in 'client->cl_error.re_status'. But with this later on in getclnthandle() ganesha may abort for condition assert(client == NULL). To fix this we now check if 'client' is not NULL then assert(CLNT_FAILURE(client)) and then destroy the client. 
Signed-off-by: Madhu Thorat --- src/rpcb_clnt.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/rpcb_clnt.c b/src/rpcb_clnt.c index 9d613eacfe..cf77b7a1da 100644 --- a/src/rpcb_clnt.c +++ b/src/rpcb_clnt.c @@ -314,7 +314,15 @@ static CLIENT *getclnthandle(const char *host, const struct netconfig *nconf, mem_free(addr_to_delete.buf, addr_to_delete.len); } if (!__rpc_nconf2sockinfo(nconf, &si)) { - assert(client == NULL); + if (client != NULL) { + /* if client!=NULL then there should + * have been a failure + */ + assert(CLNT_FAILURE(client)); + /* destroy the failed client */ + CLNT_DESTROY(client); + } + __warnx(TIRPC_DEBUG_FLAG_WARN, "%s: %s", __func__, clnt_sperrno(RPC_UNKNOWNPROTO)); client = clnt_raw_ncreate(1, 1); @@ -347,7 +355,15 @@ static CLIENT *getclnthandle(const char *host, const struct netconfig *nconf, goto out_err; } else { if (getaddrinfo(host, "sunrpc", &hints, &res) != 0) { - assert(client == NULL); + if (client != NULL) { + /* if client!=NULL then there should + * have been a failure + */ + assert(CLNT_FAILURE(client)); + /* destroy the failed client */ + CLNT_DESTROY(client); + } + __warnx(TIRPC_DEBUG_FLAG_WARN, "%s: %s", __func__, clnt_sperrno(RPC_UNKNOWNHOST)); client = clnt_raw_ncreate(1, 1); From 1ae7f2ff3f0210412fbff24b6ad09993d653c8b4 Mon Sep 17 00:00:00 2001 From: Sachin Punadikar Date: Thu, 25 Apr 2019 11:49:26 +0530 Subject: [PATCH 21/70] Do not use macro "svcauth_gss_return" The svcauth_gss_return macro was used to return from function _svcauth_gss. But this macro do not carry out required cleanup in failure cases. Removed this macro and used goto block, which takes care of required cleanup. 
Signed-off-by: Sachin Punadikar --- src/svc_auth_gss.c | 127 ++++++++++++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index 6a57cd660c..83221d04f3 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -385,15 +385,6 @@ svcauth_gss_nextverf(struct svc_req *req, struct svc_rpc_gss_data *gd, return (true); } -#define svcauth_gss_return(code) \ - do { \ - if (gc) \ - xdr_free((xdrproc_t) xdr_rpc_gss_cred, gc); \ - if (gd_locked) \ - mutex_unlock(&gd->lock); \ - return (code); \ - } while (0) - enum auth_stat _svcauth_gss(struct svc_req *req, bool *no_dispatch) { @@ -404,15 +395,17 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) struct rpc_gss_init_res gr; int call_stat, offset; OM_uint32 min_stat; - bool gd_locked = false; bool gd_hashed = false; + enum auth_stat rc = AUTH_OK; /* Initialize reply. */ req->rq_msg.RPCM_ack.ar_verf = _null_auth; /* Unserialize client credentials. */ - if (req->rq_msg.cb_cred.oa_length <= 0) - svcauth_gss_return(AUTH_BADCRED); + if (req->rq_msg.cb_cred.oa_length <= 0) { + rc = AUTH_BADCRED; + goto out; + } gc = (struct rpc_gss_cred *)req->rq_msg.rq_cred_body; memset(gc, 0, sizeof(struct rpc_gss_cred)); @@ -422,25 +415,34 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) if (!xdr_rpc_gss_cred(xdrs, gc)) { XDR_DESTROY(xdrs); - svcauth_gss_return(AUTH_BADCRED); + rc = AUTH_BADCRED; + goto cred_free; } XDR_DESTROY(xdrs); /* Check version. 
*/ - if (gc->gc_v != RPCSEC_GSS_VERSION) - svcauth_gss_return(AUTH_BADCRED); + if (gc->gc_v != RPCSEC_GSS_VERSION) { + rc = AUTH_BADCRED; + goto cred_free; + } - if (gc->gc_seq > RPCSEC_GSS_MAXSEQ) - svcauth_gss_return(RPCSEC_GSS_CTXPROBLEM); + if (gc->gc_seq > RPCSEC_GSS_MAXSEQ) { + rc = RPCSEC_GSS_CTXPROBLEM; + goto cred_free; + } - if (gc->gc_proc > RPCSEC_GSS_MAXPROC) - svcauth_gss_return(AUTH_BADCRED); + if (gc->gc_proc > RPCSEC_GSS_MAXPROC) { + rc = AUTH_BADCRED; + goto cred_free; + } /* Check RPCSEC_GSS service. */ if (gc->gc_svc != RPCSEC_GSS_SVC_NONE && gc->gc_svc != RPCSEC_GSS_SVC_INTEGRITY - && gc->gc_svc != RPCSEC_GSS_SVC_PRIVACY) - svcauth_gss_return(AUTH_BADCRED); + && gc->gc_svc != RPCSEC_GSS_SVC_PRIVACY) { + rc = AUTH_BADCRED; + goto cred_free; + } /* Context lookup. */ if ((gc->gc_proc == RPCSEC_GSS_DATA) @@ -452,8 +454,10 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) * N.B., we are explicitly allowed to discard contexts * for any reason (e.g., to save space). */ gd = authgss_ctx_hash_get(gc); - if (!gd) - svcauth_gss_return(RPCSEC_GSS_CREDPROBLEM); + if (!gd) { + rc = RPCSEC_GSS_CREDPROBLEM; + goto cred_free; + } gd_hashed = true; if (gc->gc_svc != gd->sec.svc) gd->sec.svc = gc->gc_svc; @@ -470,7 +474,6 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) /* Serialize context. */ mutex_lock(&gd->lock); - gd_locked = true; /* thread auth */ req->rq_auth = gd->auth; @@ -479,7 +482,8 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) if (gd->established) { if (get_time_fast() >= gd->endtime) { *no_dispatch = true; - svcauth_gss_return(RPCSEC_GSS_CREDPROBLEM); + rc = RPCSEC_GSS_CREDPROBLEM; + goto gd_free; } /* XXX implied serialization? or just fudging? 
advance if @@ -492,7 +496,8 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) offset = 0; } else if (offset >= gd->win || (gd->seqmask & (1 << offset))) { *no_dispatch = true; - svcauth_gss_return(AUTH_OK); + mutex_unlock(&gd->lock); + goto cred_free; } gd->seqmask |= (1 << offset); /* XXX harmless */ @@ -508,21 +513,28 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) case RPCSEC_GSS_INIT: case RPCSEC_GSS_CONTINUE_INIT: - if (req->rq_msg.cb_proc != NULLPROC) - svcauth_gss_return(AUTH_FAILED); /* XXX ? */ + if (req->rq_msg.cb_proc != NULLPROC) { + rc = AUTH_FAILED; /* XXX ? */ + goto gd_free; + } /* XXX why unconditionally acquire creds? */ - if (!svcauth_gss_acquire_cred()) - svcauth_gss_return(AUTH_FAILED); + if (!svcauth_gss_acquire_cred()) { + rc = AUTH_FAILED; + goto gd_free; + } - if (!svcauth_gss_accept_sec_context(req, gd, &gr)) - svcauth_gss_return(AUTH_REJECTEDCRED); + if (!svcauth_gss_accept_sec_context(req, gd, &gr)) { + rc = AUTH_REJECTEDCRED; + goto gd_free; + } if (!svcauth_gss_nextverf(req, gd, htonl(gr.gr_win))) { /* XXX check */ gss_release_buffer(&min_stat, &gr.gr_token); mem_free(gr.gr_ctx.value, 0); - svcauth_gss_return(AUTH_FAILED); + rc = AUTH_FAILED; + goto gd_free; } *no_dispatch = true; @@ -537,8 +549,10 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) gss_release_buffer(&min_stat, &gd->checksum); mem_free(gr.gr_ctx.value, 0); - if (call_stat >= XPRT_DIED) - svcauth_gss_return(AUTH_FAILED); + if (call_stat >= XPRT_DIED) { + rc = AUTH_FAILED; + goto gd_free; + } if (gr.gr_major == GSS_S_COMPLETE) { gd->established = true; @@ -586,30 +600,38 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) call_stat = svcauth_gss_validate(req, gd); switch (call_stat) { default: - svcauth_gss_return(RPCSEC_GSS_CREDPROBLEM); + rc = RPCSEC_GSS_CREDPROBLEM; + goto gd_free; case 0: break; } - if (!svcauth_gss_nextverf(req, gd, htonl(gc->gc_seq))) - svcauth_gss_return(AUTH_FAILED); + if (!svcauth_gss_nextverf(req, gd, htonl(gc->gc_seq))) 
{ + rc = AUTH_FAILED; + goto gd_free; + } break; case RPCSEC_GSS_DESTROY: - if (req->rq_msg.cb_proc != NULLPROC) - svcauth_gss_return(AUTH_FAILED); /* XXX ? */ + if (req->rq_msg.cb_proc != NULLPROC) { + rc = AUTH_FAILED; /* XXX ? */ + goto gd_free; + } - if (svcauth_gss_validate(req, gd)) - svcauth_gss_return(RPCSEC_GSS_CREDPROBLEM); + if (svcauth_gss_validate(req, gd)) { + rc = RPCSEC_GSS_CREDPROBLEM; + goto gd_free; + } - if (!svcauth_gss_nextverf(req, gd, htonl(gc->gc_seq))) - svcauth_gss_return(AUTH_FAILED); + if (!svcauth_gss_nextverf(req, gd, htonl(gc->gc_seq))) { + rc = AUTH_FAILED; + goto gd_free; + } *no_dispatch = true; /* avoid lock order reversal gd->lock, xprt->xp_lock */ mutex_unlock(&gd->lock); - gd_locked = false; /* This takes gd->lock, so call it after we unlock */ (void)authgss_ctx_hash_del(gd); @@ -628,15 +650,24 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) */ unref_svc_rpc_gss_data(gd); req->rq_auth = &svc_auth_none; + goto cred_free; break; default: - svcauth_gss_return(AUTH_REJECTEDCRED); + rc = AUTH_REJECTEDCRED; break; } - - svcauth_gss_return(AUTH_OK); +gd_free: + mutex_unlock(&gd->lock); + if (gd_hashed) { + unref_svc_rpc_gss_data(gd); + gd_hashed = false; + } +cred_free: + xdr_free((xdrproc_t) xdr_rpc_gss_cred, gc); +out: + return rc; } static bool From 9ef94befc35d8493167dadc4f28f10d7252834a6 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Tue, 30 Apr 2019 12:59:38 -0400 Subject: [PATCH 22/70] Mark ANYFD clients as local clients ANYFD clients always get a new FD, so those xprts need to be destroyed. Mark them local, so that they're destroyed properly, to avoid leaking a FD. 
Signed-off-by: Daniel Gryniewicz --- src/clnt_generic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/clnt_generic.c b/src/clnt_generic.c index ebde41434e..4e43f7a9b1 100644 --- a/src/clnt_generic.c +++ b/src/clnt_generic.c @@ -401,6 +401,11 @@ clnt_tli_ncreate(int fd, const struct netconfig *nconf, goto err; } + if (flags & CLNT_CREATE_FLAG_CLOSE) { + /* We got a new FD; this makes it a local client */ + cl->cl_flags |= CLNT_FLAG_LOCAL; + } + if (nconf) { cl->cl_netid = mem_strdup(nconf->nc_netid); cl->cl_tp = mem_strdup(nconf->nc_device); From daaece7e4be3a77eafa8513c61f2977d0630b915 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Tue, 30 Apr 2019 13:07:58 -0400 Subject: [PATCH 23/70] Indicate this is a dev version of ntirpc Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bba99680e7..11f0b71199 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ project(NTIRPC C) # version numbers set(NTIRPC_MAJOR_VERSION 1) set(NTIRPC_MINOR_VERSION 7) -set(NTIRPC_PATCH_LEVEL 1) +set(NTIRPC_PATCH_LEVEL 999) set(VERSION_COMMENT "Full-duplex and bi-directional ONC RPC on TCP." ) From c1b95f7519cb3ecbeccdeb69f9d5f534c58383d0 Mon Sep 17 00:00:00 2001 From: Madhu Thorat Date: Fri, 17 May 2019 03:59:23 -0400 Subject: [PATCH 24/70] Don't attempt to destroy XPRT if CLNT create was unsuccessful Currently in clnt_vc_destroy() we call SVC_DESTROY for a XPRT, but if CLNT (client handle) creation failed then the related 'cx->cx_rec' won't be valid and this will lead to a crash. Fixed this by calling SVC_DESTROY only when 'cx->cx_rec' is valid. 
Signed-off-by: Madhu Thorat --- src/clnt_vc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/clnt_vc.c b/src/clnt_vc.c index ff83c3fb5a..e680b5532b 100644 --- a/src/clnt_vc.c +++ b/src/clnt_vc.c @@ -460,12 +460,12 @@ clnt_vc_destroy(CLIENT *clnt) if (cx->cx_rec) { SVC_RELEASE(&cx->cx_rec->xprt, SVC_RELEASE_FLAG_NONE); - } - if (clnt->cl_flags & CLNT_FLAG_LOCAL) { - /* Local client; destroy the xprt */ - SVC_DESTROY(&cx->cx_rec->xprt); - } + if (clnt->cl_flags & CLNT_FLAG_LOCAL) { + /* Local client; destroy the xprt */ + SVC_DESTROY(&cx->cx_rec->xprt); + } + } clnt_vc_data_free(CT_DATA(cx)); } From 69752207c54cfd3a3c1289a3745a029251b2f1fd Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Fri, 17 May 2019 11:03:44 -0400 Subject: [PATCH 25/70] gss_data - Don't unlock after free Signed-off-by: Daniel Gryniewicz --- ntirpc/rpc/gss_internal.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ntirpc/rpc/gss_internal.h b/ntirpc/rpc/gss_internal.h index d17b6588dd..d20fbd9edc 100644 --- a/ntirpc/rpc/gss_internal.h +++ b/ntirpc/rpc/gss_internal.h @@ -119,6 +119,8 @@ unref_svc_rpc_gss_data(struct svc_rpc_gss_data *gd) /* if refcnt is 0, gd is not reachable */ if (unlikely(atomic_dec_uint32_t(&gd->refcnt) == 0)) { svcauth_gss_destroy(gd->auth); + /* gd was unlocked and freed. */ + return; } mutex_unlock(&gd->lock); From 0886fedcc0266279b103fd8252c0f069f5e08640 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Fri, 17 May 2019 11:31:09 -0400 Subject: [PATCH 26/70] _svcauth_gss - Clean up gss_data handling gss_data (gd) is refcounted. authgss_ctx_hash_get() takes a ref, alloc_svc_rpc_gss_data() takes a ref, and authgss_ctx_hash_set() takes a ref. This means that gd needs to be unref'd on failure, always. In addition, remove gd_hashed, since we need to unref if we allocated it, and it cannot be hashed for RPCSEC_GSS_INIT or RPCSEC_GSS_CONTINUE_INIT. 
Signed-off-by: Daniel Gryniewicz --- src/svc_auth_gss.c | 80 ++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 42 deletions(-) diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index 83221d04f3..d0cd983fc1 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -395,7 +395,6 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) struct rpc_gss_init_res gr; int call_stat, offset; OM_uint32 min_stat; - bool gd_hashed = false; enum auth_stat rc = AUTH_OK; /* Initialize reply. */ @@ -403,8 +402,7 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) /* Unserialize client credentials. */ if (req->rq_msg.cb_cred.oa_length <= 0) { - rc = AUTH_BADCRED; - goto out; + return AUTH_BADCRED; } gc = (struct rpc_gss_cred *)req->rq_msg.rq_cred_body; @@ -458,7 +456,6 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) rc = RPCSEC_GSS_CREDPROBLEM; goto cred_free; } - gd_hashed = true; if (gc->gc_svc != gd->sec.svc) gd->sec.svc = gc->gc_svc; } @@ -496,8 +493,7 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) offset = 0; } else if (offset >= gd->win || (gd->seqmask & (1 << offset))) { *no_dispatch = true; - mutex_unlock(&gd->lock); - goto cred_free; + goto gd_free; } gd->seqmask |= (1 << offset); /* XXX harmless */ @@ -556,40 +552,38 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) if (gr.gr_major == GSS_S_COMPLETE) { gd->established = true; - if (!gd_hashed) { - - /* krb5 pac -- try all that apply */ - gss_buffer_desc attr, display_buffer; - - /* completely generic */ - int auth = 1, comp = 0, more = -1; - - memset(&gd->pac.ms_pac, 0, - sizeof(gss_buffer_desc)); - memset(&display_buffer, 0, - sizeof(gss_buffer_desc)); - - /* MS AD */ - attr.value = "urn:mspac:"; - attr.length = 10; - - gr.gr_major = - gss_get_name_attribute(&gr.gr_minor, - gd->client_name, - &attr, &auth, &comp, - &gd->pac.ms_pac, - &display_buffer, - &more); - - if (gr.gr_major == GSS_S_COMPLETE) { - /* dont need it */ - gss_release_buffer(&gr.gr_minor, - 
&display_buffer); - gd->flags |= SVC_RPC_GSS_FLAG_MSPAC; - } - - (void)authgss_ctx_hash_set(gd); + + /* krb5 pac -- try all that apply */ + gss_buffer_desc attr, display_buffer; + + /* completely generic */ + int auth = 1, comp = 0, more = -1; + + memset(&gd->pac.ms_pac, 0, + sizeof(gss_buffer_desc)); + memset(&display_buffer, 0, + sizeof(gss_buffer_desc)); + + /* MS AD */ + attr.value = "urn:mspac:"; + attr.length = 10; + + gr.gr_major = + gss_get_name_attribute(&gr.gr_minor, + gd->client_name, + &attr, &auth, &comp, + &gd->pac.ms_pac, + &display_buffer, + &more); + + if (gr.gr_major == GSS_S_COMPLETE) { + /* dont need it */ + gss_release_buffer(&gr.gr_minor, + &display_buffer); + gd->flags |= SVC_RPC_GSS_FLAG_MSPAC; } + + (void)authgss_ctx_hash_set(gd); } break; @@ -660,13 +654,15 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) } gd_free: mutex_unlock(&gd->lock); - if (gd_hashed) { + + if (rc != AUTH_OK) { + /* On success, the ref gets returned to the caller */ unref_svc_rpc_gss_data(gd); - gd_hashed = false; } + cred_free: xdr_free((xdrproc_t) xdr_rpc_gss_cred, gc); -out: + return rc; } From 0d41ced464bdc34379d692feb0b9468312e9755f Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Tue, 28 May 2019 10:42:50 -0400 Subject: [PATCH 27/70] SVC - Don't double release xprt on write error On write error, svc_ioq_flushv() was destroying the xprt. This caused a double-unref, since svc_ioq_write() was unconditionally releasing it's ref. Instead, have svc_ioq_flushv() return an error, and let svc_ioq_write() handle releasing/destroying. 
Signed-off-by: Daniel Gryniewicz --- src/svc_ioq.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/svc_ioq.c b/src/svc_ioq.c index e99b55ee7d..88068c3615 100644 --- a/src/svc_ioq.c +++ b/src/svc_ioq.c @@ -117,7 +117,7 @@ svc_ioq_init(void) #define LAST_FRAG ((u_int32_t)(1 << 31)) #define MAXALLOCA (256) -static inline void +static inline int svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) { struct iovec *iov, *tiov, *wiov; @@ -130,6 +130,7 @@ svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) u_int32_t vsize = (xioq->ioq_uv.uvqh.qcount + 1) * sizeof(struct iovec); int iw = 0; int ix = 1; + int rc = 0; if (unlikely(vsize > MAXALLOCA)) { iov = mem_alloc(vsize); @@ -195,7 +196,7 @@ svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s() writev failed (%d)\n", __func__, errno); - SVC_DESTROY(xprt); + rc = -1; break; } fbytes -= result; @@ -216,6 +217,8 @@ svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) if (unlikely(vsize > MAXALLOCA)) { mem_free(iov, vsize); } + + return rc; } static void @@ -224,13 +227,22 @@ svc_ioq_write(SVCXPRT *xprt, struct xdr_ioq *xioq, struct poolq_head *ifph) struct poolq_entry *have; for (;;) { + int rc = 0; + /* do i/o unlocked */ if (svc_work_pool.params.thrd_max && !(xprt->xp_flags & SVC_XPRT_FLAG_DESTROYED)) { /* all systems are go! 
*/ - svc_ioq_flushv(xprt, xioq); + rc = svc_ioq_flushv(xprt, xioq); } - SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); + + if (rc < 0) { + /* IO failed, destroy rather than releasing */ + SVC_DESTROY(xprt); + } else { + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); + } + XDR_DESTROY(xioq->xdrs); mutex_lock(&ifph->qmutex); From 3b7415c37ab2cc08b9c8d00862af23d2db23d9aa Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 30 May 2019 13:44:56 -0400 Subject: [PATCH 28/70] 1.8.0 Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11f0b71199..878dcf40b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,8 +14,8 @@ project(NTIRPC C) # version numbers set(NTIRPC_MAJOR_VERSION 1) -set(NTIRPC_MINOR_VERSION 7) -set(NTIRPC_PATCH_LEVEL 999) +set(NTIRPC_MINOR_VERSION 8) +set(NTIRPC_PATCH_LEVEL 0) set(VERSION_COMMENT "Full-duplex and bi-directional ONC RPC on TCP." ) From dbfec91067a5e755c1225dbada1f457115caec5b Mon Sep 17 00:00:00 2001 From: Madhu Thorat Date: Sun, 7 Jul 2019 20:09:26 -0400 Subject: [PATCH 29/70] In _svcauth_gss() in case of failure set 'req->rq_auth' to NULL Currently in _svcauth_gss() after having a valid 'gd' we increment 'gd->refcnt' and set 'req->rq_auth=gd->auth'. In case of failure for a valid 'gd' we call unref_svc_rpc_gss_data() which decrements 'gd->refcnt' and may destroy 'gd' if 'gd->refcnt' becomes 0. But later on when free_nfs_request() is called, as 'req->rq_auth' is still valid, we call SVCAUTH_RELEASE() which either leads to a crash if 'gd' was already freed or if 'gd' was not freed then we decrement 'gd->refcnt' for the second time. Fixed this by setting 'req->rq_auth=NULL' in case of failure in _svcauth_gss(). 
Signed-off-by: Madhu Thorat --- src/svc_auth_gss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index d0cd983fc1..3fc221bbb5 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -658,6 +658,7 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) if (rc != AUTH_OK) { /* On success, the ref gets returned to the caller */ unref_svc_rpc_gss_data(gd); + req->rq_auth = NULL; } cred_free: From 1cab741ae92e28185d687d9021458c98b57ac27d Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Thu, 28 Mar 2019 11:07:06 -0700 Subject: [PATCH 30/70] xdr_ioq.c: fix xdr_ioq_setpos to deal with partially full buffers If a vector includes some partially filled buffers, setpos must position into the next buffer if it would land at or past the tail of a buffer that is not the terminal buffer. This will be required if putbufs is made to work. It is also required for gss_wrap_iov where we need to insert a header just before the response data after the response data has already been encoded. This means the response data MUST start in a new buffer, leaving the previous buffer potentially partially full. Signed-off-by: Frank S. Filz --- src/xdr_ioq.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/xdr_ioq.c b/src/xdr_ioq.c index 35f6ed4a32..1f25153f0e 100644 --- a/src/xdr_ioq.c +++ b/src/xdr_ioq.c @@ -664,13 +664,20 @@ xdr_ioq_setpos(XDR *xdrs, u_int pos) TAILQ_FOREACH(have, &(XIOQ(xdrs)->ioq_uv.uvqh.qh), q) { struct xdr_ioq_uv *uv = IOQ_(have); + struct xdr_ioq_uv *next = IOQ_(TAILQ_NEXT(have, q)); u_int len = ioquv_length(uv); u_int full = (uintptr_t)xdrs->x_v.vio_wrap - (uintptr_t)xdrs->x_v.vio_head; - if (pos <= full) { - /* allow up to the end of the buffer, - * assuming next operation will extend. + /* If we have a next buffer and pos would land exactly at the + * tail of this buffer, we want to force positioning in the + * next buffer. 
The space between the tail of this buffer and + * the wrap of this buffer is unused and MUST be skipped. + */ + if ((pos < len) || (next == NULL && pos <= full)) { + /* allow up to the end of the buffer, unless there is + * a next buffer in which case only allow up to the + * tail assuming next operation will extend. */ xdrs->x_data = uv->v.vio_head + pos; xdrs->x_base = &uv->v; From 6a45a1c96f459c3317fd371742b4f05c8fdaea4d Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Fri, 10 May 2019 15:41:46 -0700 Subject: [PATCH 31/70] Add XDR functions to support gss iov functions Add four new interfaces to xdr_ops XDR_NEWBUFS - forces a vector xdr like xdr_ioq to start a new buffer so that later a GSS HEADER buffer can be inserted. XDR_IOVCOUNT - counts the number of xdr_iov buffers occupied in an xdr stream from pos for datalen bytes. XDR_FILLBUFS - fills a new xdr_iov vector with the buffer pointers to an xdr stream from pos for datalen bytes. XDR_ALLOCHDRS - allocates space in an xdr stream for HEADER and TRAILER buffers. If the xdr allows, a HEADER buffer may be inserted just before pos. All xdr types should be able to support appending TRAILER buffers. Signed-off-by: Frank S. 
Filz --- ntirpc/rpc/xdr.h | 38 +++- src/xdr_ioq.c | 459 ++++++++++++++++++++++++++++++++++++++++++++++- src/xdr_mem.c | 78 ++++++++ 3 files changed, 569 insertions(+), 6 deletions(-) diff --git a/ntirpc/rpc/xdr.h b/ntirpc/rpc/xdr.h index 240d750731..423c8bd036 100644 --- a/ntirpc/rpc/xdr.h +++ b/ntirpc/rpc/xdr.h @@ -117,12 +117,23 @@ enum xdr_op { * BYTES_PER_XDR_UNIT) #endif +/* XDR vector buffer types */ +typedef enum vio_type { + VIO_HEADER, /* header buffer before data */ + VIO_DATA, /* data buffer */ + VIO_TRAILER, /* trailer buffer after data */ + VIO_TRAILER_LEN, /* trailer buffer that needs a length ahead */ +} vio_type; + /* XDR buffer vector descriptors */ typedef struct xdr_vio { uint8_t *vio_base; uint8_t *vio_head; /* minimum vio_tail (header offset) */ - uint8_t *vio_tail; + uint8_t *vio_tail; /* end of the used part of the buffer */ uint8_t *vio_wrap; /* maximum vio_tail */ + uint32_t vio_length; /* length of buffer, used for vector + pre-allocation */ + vio_type vio_type; /* type of buffer */ } xdr_vio; /* vio_wrap >= vio_tail >= vio_head >= vio_base */ @@ -188,6 +199,14 @@ typedef struct rpc_xdr { /* new vector and refcounted interfaces */ bool (*x_getbufs)(struct rpc_xdr *, xdr_uio *, u_int); bool (*x_putbufs)(struct rpc_xdr *, xdr_uio *, u_int); + /* Force a new buffer to start (or fail) */ + bool (*x_newbuf)(struct rpc_xdr *); + /* Return the count of buffers in the vector from pos */ + int (*x_iovcount)(struct rpc_xdr *, u_int, u_int); + /* Fill xdr_vio with buffers from pos */ + bool (*x_fillbufs)(struct rpc_xdr *, u_int , xdr_vio *, u_int); + /* Allocate bufs for headers and trailers and insert into vio */ + bool (*x_allochdrs)(struct rpc_xdr *, u_int , xdr_vio *, int); } *x_ops; void *x_public; /* users' data */ void *x_private; /* pointer to private data */ @@ -331,6 +350,23 @@ xdr_putlong(XDR *xdrs, const long *lp) (*(xdrs)->x_ops->x_control)(xdrs, req, op) #define xdr_control(xdrs, req, op) XDR_CONTROL(xdrs, req, op) +#define 
XDR_NEWBUF(xdrs) \ + (*(xdrs)->x_ops->x_newbuf)(xdrs) +#define xdr_newbuf(xdrs, pos) XDR_NEWBUF(xdrs) + +#define XDR_IOVCOUNT(xdrs, pos, len) \ + (*(xdrs)->x_ops->x_iovcount)(xdrs, pos, len) +#define xdr_iovcount(xdrs, pos, len) XDR_IOVCOUNT(xdrs, pos, len) + +#define XDR_FILLBUFS(xdrs, pos, iov, len) \ + (*(xdrs)->x_ops->x_fillbufs)(xdrs, pos, iov, len) +#define xdr_fillbufs(xdrs, pos, iov, len) XDR_FILLBUFS(xdrs, pos, iov, len) + +#define XDR_ALLOCHDRS(xdrs, pos, iov, iov_count) \ + (*(xdrs)->x_ops->x_allochdrs)(xdrs, pos, iov, iov_count) +#define xdr_allochdrs(xdrs, pos, iov, iov_count) \ + XDR_ALLOCHDRS(xdrs, pos, iov, iov_count) + /* * Support struct for discriminated unions. * You create an array of xdrdiscrim structures, terminated with diff --git a/src/xdr_ioq.c b/src/xdr_ioq.c index 1f25153f0e..b4938abf32 100644 --- a/src/xdr_ioq.c +++ b/src/xdr_ioq.c @@ -53,8 +53,6 @@ #include -static bool xdr_ioq_noop(void) __attribute__ ((unused)); - #define VREC_MAXBUFS 24 static uint64_t next_id; @@ -82,6 +80,10 @@ xdr_ioq_uv_create(size_t size, u_int uio_flags) uv->u.uio_flags = uio_flags; uv->u.uio_references = 1; /* starting one */ + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s() uv %p size %lu", + __func__, uv, (unsigned long) size); + return (uv); } @@ -327,6 +329,13 @@ xdr_ioq_uv_advance(struct xdr_ioq *xioq) len = ioquv_length(uv); xioq->ioq_uv.plength += len; +#if 0 + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s() uv %p len %lu plength %lu NEXT %p", + __func__, uv, (unsigned long) len, (unsigned long) xioq->ioq_uv.plength, + IOQ_(TAILQ_NEXT(&uv->uvq, q))); +#endif + /* next buffer, if any */ return IOQ_(TAILQ_NEXT(&uv->uvq, q)); } @@ -669,6 +678,14 @@ xdr_ioq_setpos(XDR *xdrs, u_int pos) u_int full = (uintptr_t)xdrs->x_v.vio_wrap - (uintptr_t)xdrs->x_v.vio_head; + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Examining xdr_ioq_uv %p (base %p head %p tail %p wrap %p len %lu full %lu) - %s pos %lu", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + 
(unsigned long) len, (unsigned long) full, + next ? "more" : "last", + (unsigned long) pos); + /* If we have a next buffer and pos would land exactly at the * tail of this buffer, we want to force positioning in the * next buffer. The space between the tail of this buffer and @@ -689,6 +706,10 @@ xdr_ioq_setpos(XDR *xdrs, u_int pos) XIOQ(xdrs)->ioq_uv.pcount++; } + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s failing with remaining %lu", + __func__, (unsigned long) pos); + return (false); } @@ -767,9 +788,433 @@ xdr_ioq_control(XDR *xdrs, /* const */ int rq, void *in) } static bool -xdr_ioq_noop(void) +xdr_ioq_newbuf(XDR *xdrs) { - return (false); + struct xdr_ioq_uv *uv; + + /* We need to start a new buffer whether the current buffer is full or + * not. + */ + uv = xdr_ioq_uv_advance(XIOQ(xdrs)); + + if (!uv) + uv = xdr_ioq_uv_append(XIOQ(xdrs), IOQ_FLAG_BALLOC); + else + xdr_ioq_uv_update(XIOQ(xdrs), uv); + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s() uv %p", + __func__, uv); + + /* At this point, the position has been updated to point to the + * start of the new buffer since xdr_ioq_uv_update has been called + * (it's called at the end of xdr_ioq_uv_append). 
+ */ + return true; +} + +static int +xdr_ioq_iovcount(XDR *xdrs, u_int start, u_int datalen) +{ + /* Buffers starts at -1 to indicate start has not yet been found */ + int buffers = -1; + struct poolq_entry *have; + struct xdr_ioq_uv *uv; + + /* update the most recent data length, just in case */ + xdr_tail_update(xdrs); + + TAILQ_FOREACH(have, &(XIOQ(xdrs)->ioq_uv.uvqh.qh), q) { + u_int len; + + uv = IOQ_(have); + len = ioquv_length(uv); + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Examining xdr_ioq_uv %p (base %p head %p tail %p wrap %p) - start %lu len %lu buffers %d", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + (unsigned long) start, (unsigned long) len, buffers); + + if (buffers > 0) { + /* Accumulate another buffer */ + buffers++; + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Accumulated another buffer total = %d", + buffers); + } else if (start < len) { + /* We have found the buffer that start begins. */ + buffers = 1; + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Starting total = %d", buffers); + } else { + /* Keep looking, need to reduce start by the length of + * this buffer. + */ + start -= len; + } + if (buffers > 0) { + /* Now we need to decrement the datalen to see if we're + * done. Note the first time we come in, start may not + * be zero, which represents the fact that start was in + * the middle of this buffer, just subtract the + * remaining start from the length of this buffer. + */ + u_int buflen = uv->v.vio_tail - uv->v.vio_head - start; + if (buflen >= datalen) { + /* We have found end. */ + datalen = 0; + break; + } + + /* Decrement the datalen, and zero out start for future + * buffers. + */ + datalen -= buflen; + start = 0; + } + } + + if (datalen != 0) { + /* There wasn't enough data... 
*/ + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s start %lu remain %lu", + __func__, (unsigned long) start, + (unsigned long) datalen); + return -1; + } + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s start %lu buffers %d", + __func__, (unsigned long) start, buffers); + + /* If start was not within the xdr stream, buffers will still be -1 */ + return buffers; +} + +static bool +xdr_ioq_fillbufs(XDR *xdrs, u_int start, xdr_vio *vector, u_int datalen) +{ + bool found = false; + struct poolq_entry *have; + struct xdr_ioq_uv *uv; + int idx = 0; + + /* update the most recent data length, just in case */ + xdr_tail_update(xdrs); + + TAILQ_FOREACH(have, &(XIOQ(xdrs)->ioq_uv.uvqh.qh), q) { + u_int len; + + uv = IOQ_(have); + len = ioquv_length(uv); + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Examining xdr_ioq_uv %p (base %p head %p tail %p wrap %p len %lu) - %s start %lu remain %lu idx %d", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + (unsigned long) len, + found ? "found" : "not found", + (unsigned long) start, (unsigned long) datalen, idx); + + if (!found) { + if (start < len) { + /* We have found the buffer that start begins. + */ + found = true; + __warnx(TIRPC_DEBUG_FLAG_XDR, "found"); + } else { + /* Keep looking, need to reduce start by the + * length of this buffer. + */ + start -= len; + } + } + + if (found) { + vector[idx] = uv->v; + vector[idx].vio_type = VIO_DATA; + + if (start > 0) { + /* The start position wasn't at the start of + * a buffer, adjust the vio_head of this buffer + * and len and then zero out start for + * future buffers. + */ + len -= start; + vector[idx].vio_head += start; + start = 0; + } + + vector[idx].vio_length = len; + + if (datalen < vector[idx].vio_length) { + /* This is the last buffer, and we're not using + * all of it, adjust vio_length and vio_tail. 
+ */ + vector[idx].vio_length = datalen; + vector[idx].vio_tail = vector[idx].vio_head + + datalen; + datalen = 0; + break; + } else if (datalen == vector[idx].vio_length) { + /* We have reached the end. */ + datalen = 0; + break; + } + + datalen -= vector[idx].vio_length; + + idx++; + } + } + + if (datalen != 0) { + /* There wasn't enough data... */ + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s start %lu remain %lu", + __func__, (unsigned long) start, + (unsigned long) datalen); + return false; + } + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s %s start %lu remain %lu idx %d", + __func__, found ? "found" : "not found", + (unsigned long) start, (unsigned long) datalen, idx); + + return found; +} + +static struct xdr_ioq_uv * +xdr_ioq_use_or_allocate(struct xdr_ioq *xioq, xdr_vio *v, struct xdr_ioq_uv *uv) +{ + struct poolq_entry *have = &uv->uvq, *have2; + struct xdr_ioq_uv *uv2; + + /* We have a header or tailer, let's see if it fits in this buffer, + * otherwise allocate and insert a new buffer. + */ + uint32_t htlen = v->vio_length; + + if (v->vio_type == VIO_TRAILER_LEN) { + /* First we need to fit in and encode the length of the trailer + */ + xdr_vio vlen; + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Fitting length xdr_ioq_uv %p (base %p head %p tail %p wrap %p) size %lu length %lu has %lu looking for 4", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + (unsigned long) ioquv_size(uv), + (unsigned long) ioquv_length(uv), + (unsigned long) ioquv_more(uv)); + + /* Set up a dummy xdr_vio for the length */ + memset(&vlen, 0, sizeof(vlen)); + vlen.vio_type = VIO_TRAILER; + vlen.vio_length = 4; + + /* Now recursively call to get space for the length */ + uv = xdr_ioq_use_or_allocate(xioq, &vlen, uv); + + /* Now we have space, either in the previous buffer or a new + * buffer, go ahead and encode the length into it. 
+ */ + *((uint32_t *) (vlen.vio_head)) = + (uint32_t) htonl(v->vio_length); + + /* Becasue we have already set up the gss_iov, it's ok to + * sneak the length it, it won't be part of the gss_iov but it + * IS part of the xdr_iov. + */ + } + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Examining xdr_ioq_uv %p (base %p head %p tail %p wrap %p) size %lu length %lu has %lu looking for %lu", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + (unsigned long) ioquv_size(uv), (unsigned long) ioquv_length(uv), + (unsigned long) ioquv_more(uv), htlen); + + if (ioquv_more(uv) >= htlen) { + /* The HEADER will fit */ + v->vio_base = uv->v.vio_base; + v->vio_head = uv->v.vio_tail; + v->vio_tail = uv->v.vio_tail + htlen; + v->vio_wrap = uv->v.vio_wrap; + + /* Fixup tail of this buffer */ + uv->v.vio_tail = v->vio_tail; + } else { + /* We have to allocate and insert a new buffer */ + if (xioq->ioq_uv.uvq_fetch) { + /** @todo: does this actually work? */ + /* more of the same kind */ + have2 = + xioq->ioq_uv.uvq_fetch( + xioq, uv->u.uio_p1, + "next buffer", 1, + IOQ_FLAG_NONE); + + /* poolq_entry is the top element of xdr_ioq_uv + */ + uv2 = IOQ_(have2); + assert((void *)uv2 == (void *)have2); + } else { + uv2 = xdr_ioq_uv_create(xioq->ioq_uv.min_bsize, + UIO_FLAG_FREE); + have2 = &uv2->uvq; + (xioq->ioq_uv.uvqh.qcount)++; + TAILQ_INSERT_AFTER(&xioq->ioq_uv.uvqh.qh, + have, have2, q); + + /* Advance to new buffer */ + uv = uv2; + have = have2; + } + + /* Now set up for the header in the new buffer */ + v->vio_base = uv->v.vio_base; + v->vio_head = uv->v.vio_head; + v->vio_tail = uv->v.vio_head + htlen; + v->vio_wrap = uv->v.vio_wrap; + + /* Fixup tail of this buffer */ + uv->v.vio_tail = v->vio_tail; + } + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Produced xdr_ioq_uv %p (base %p head %p tail %p wrap %p) size %lu length %lu", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + (unsigned long) ioquv_size(uv), + (unsigned long) 
ioquv_length(uv)); + + return uv; +} + +static bool +xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) +{ + bool found = false; + struct xdr_ioq_uv *uv; + int idx = 0; + struct xdr_ioq *xioq = XIOQ(xdrs); + struct poolq_entry *have; + + /* update the most recent data length, just in case */ + xdr_tail_update(xdrs); + + TAILQ_FOREACH(have, &(XIOQ(xdrs)->ioq_uv.uvqh.qh), q) { + u_int len; + + uv = IOQ_(have); + len = ioquv_length(uv); + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Examining xdr_ioq_uv %p (base %p head %p tail %p wrap %p) - %s start %lu len %lu", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + found ? "found" : "not found", + (unsigned long) start, (unsigned long) len); + + if (start < len) { + /* start is in this buffer, but not at the start. + * This should be the first data buffer. + */ + found = true; + break; + } + + /* Keep looking, need to reduce start by the length of + * this buffer. + */ + start -= len; + + if (start == 0) { + /* We have found the buffer prior to the one + * that begins at start. + */ + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s found start after %p", + __func__, uv); + found = true; + break; + } + } + + if (!found) { + /* Failure */ + return false; + } + + /* uv and have are the buffer just before start */ + + if (vector[idx].vio_type == VIO_HEADER) { + if (start != 0) { + /* We are leading with a HEADER, but this buffer has + * data beyond start, so we can't insert the HEADER in + * the right place... + */ + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Oops, trying to insert HEADER in the middle of a buffer"); + return false; + } + + /* We have a header, let's see if it fits in this buffer, + * otherwise allocate and insert a new buffer. 
+ */ + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Calling xdr_ioq_use_or_allocate for idx %d for VIO_HEADER", + idx); + + uv = xdr_ioq_use_or_allocate(xioq, &vector[idx], uv); + + /* Advance to next (DATA) buffer */ + idx++; + } + + if (start == 0) { + /* We have the buffer prior to the DATA buffer that should be + * at start, so advance to the next buffer so we will now have + * the first DATA buffer. + */ + uv = IOQ_(TAILQ_NEXT(&uv->uvq, q)); + } + + /* Now idx, uv, and have should be the first DATA buffer */ + while (idx < iov_count && vector[idx].vio_type == VIO_DATA) { + /* Advance to next buffer */ + have = TAILQ_NEXT(have, q); + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Skipping idx %d for VIO_DATA", + idx); + + if (have != NULL) { + /* Next buffer exists */ + uv = IOQ_(have); + } /* else leave the last DATA buffer */ + + idx++; + } + + /* Now idx, uv, and have are the last DATA buffer */ + + while (idx < iov_count) { + /* Another TRAILER buffer to manage */ + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Calling xdr_ioq_use_or_allocate for idx %d for VIO_TRAILER", + idx); + + uv = xdr_ioq_use_or_allocate(xioq, &vector[idx], uv); + + /* Next vector buffer */ + idx++; + } + + return true; } const struct xdr_ops xdr_ioq_ops = { @@ -782,5 +1227,9 @@ const struct xdr_ops xdr_ioq_ops = { xdr_ioq_destroy_internal, xdr_ioq_control, xdr_ioq_getbufs, - xdr_ioq_putbufs + xdr_ioq_putbufs, + xdr_ioq_newbuf, /* x_newbuf */ + xdr_ioq_iovcount, /* x_iovcount */ + xdr_ioq_fillbufs, /* x_fillbufs */ + xdr_ioq_allochdrs, /* x_allochdrs */ }; diff --git a/src/xdr_mem.c b/src/xdr_mem.c index 71ec90ad50..318193ba24 100644 --- a/src/xdr_mem.c +++ b/src/xdr_mem.c @@ -57,6 +57,7 @@ typedef bool (*dummyfunc3)(XDR *, int, void *); typedef bool (*dummy_getbufs)(XDR *, xdr_uio *, u_int); typedef bool (*dummy_putbufs)(XDR *, xdr_uio *, u_int); +typedef bool (*dummy_newbuf)(struct rpc_xdr *); static const struct xdr_ops xdrmem_ops_aligned; @@ -174,6 +175,79 @@ xdrmem_noop(void) return (false); } +static int 
+xdrmem_iovcount(XDR *xdrs, u_int start, u_int datalen) +{ + if ((xdrs->x_v.vio_head + start + datalen) > xdrs->x_v.vio_tail) { + /* start and datalen reference outside the size of the data + * in the buffer. + */ + return -1; + } + + return 1; +} + +static bool +xdrmem_fillbufs(XDR *xdrs, u_int start, xdr_vio *vector, u_int datalen) +{ + if ((xdrs->x_v.vio_head + start + datalen) > xdrs->x_v.vio_tail) { + /* start and datalen reference outside the size of the data + * in the buffer. + */ + return false; + } + + vector[0] = xdrs->x_v; + vector[0].vio_type = VIO_DATA; + vector[0].vio_length = vector[0].vio_tail - vector[0].vio_head; + return true; +} + +static bool +xdrmem_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) +{ + int i; + bool found_data = false; + uint8_t *current = xdrs->x_data; + + for (i = 0; i < iov_count; i++) { + /* If we have found_data and we find another VIO_DATA oops... + * If we have not found_data and we find a non-VIO_DATA oops... + * This simplifies to a single test... + */ + if (found_data == (vector[i].vio_type != VIO_DATA)) { + /* We are being called with a vector we can't support. + * Fixup xdrs and leave. 
+ */ + xdrs->x_data = current; + return false; + } + + if (vector[i].vio_type != VIO_DATA) { + /* Append a reserved buffer for this */ + uint8_t *future = xdrs->x_data + vector[i].vio_length; + + if (future > xdrs->x_v.vio_wrap) { + /* Not enough space, fixup xdrs and leave */ + xdrs->x_data = current; + return false; + } + vector[i].vio_base = xdrs->x_v.vio_base; + vector[i].vio_head = xdrs->x_data; + vector[i].vio_tail = future; + vector[i].vio_wrap = xdrs->x_v.vio_wrap; + xdrs->x_data = future; + } else { + found_data = true; + } + } + + /* update the most recent data length */ + xdr_tail_update(xdrs); + return true; +} + static const struct xdr_ops xdrmem_ops_aligned = { xdrmem_getunit, xdrmem_putunit, @@ -185,4 +259,8 @@ static const struct xdr_ops xdrmem_ops_aligned = { (dummyfunc3) xdrmem_noop, /* x_control */ (dummy_getbufs) xdrmem_noop, /* x_getbufs */ (dummy_putbufs) xdrmem_noop, /* x_putbufs */ + (dummy_newbuf) xdrmem_noop, /* x_newbuf */ + xdrmem_iovcount, /* x_iovcount */ + xdrmem_fillbufs, /* x_fillbufs */ + xdrmem_allochdrs, /* x_allochdrs */ }; From 9704d8b9dd6b5a71c3500ab74ba112c34c12db12 Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Tue, 6 Aug 2019 10:18:40 -0700 Subject: [PATCH 32/70] Replace ioq_ifqh with poolq_head in each rpc_dplx_rec This makes it easy to serialize writes for a given SVCXPRT. This will eventually be compressed into the single non-blocking patch. Signed-off-by: Frank S. 
Filz --- src/rpc_dplx_internal.h | 6 +++++ src/svc.c | 1 - src/svc_ioq.c | 58 +++++------------------------------------ src/svc_ioq.h | 1 - 4 files changed, 13 insertions(+), 53 deletions(-) diff --git a/src/rpc_dplx_internal.h b/src/rpc_dplx_internal.h index 626c9e70b1..b21c848a5e 100644 --- a/src/rpc_dplx_internal.h +++ b/src/rpc_dplx_internal.h @@ -32,6 +32,7 @@ #include #include #include +#include /* Svc event strategy */ enum svc_event_type { @@ -51,6 +52,7 @@ typedef struct rpc_dplx_lock { struct rpc_dplx_rec { struct svc_xprt xprt; /**< Transport Independent handle */ struct xdr_ioq ioq; + struct poolq_head writeq; /**< poolq for write requests */ struct opr_rbtree call_replies; struct opr_rbtree_node fd_node; struct { @@ -109,6 +111,9 @@ rpc_dplx_rec_init(struct rpc_dplx_rec *rec) rpc_dplx_lock_init(&rec->recv.lock); opr_rbtree_init(&rec->call_replies, clnt_req_xid_cmpf); mutex_init(&rec->xprt.xp_lock, NULL); + TAILQ_INIT(&rec->writeq.qh); + mutex_init(&rec->writeq.qmutex, NULL); + rec->writeq.qcount = 0; /* Stop this xprt being cleaned immediately */ (void)clock_gettime(CLOCK_MONOTONIC_FAST, &(rec->recv.ts)); @@ -120,6 +125,7 @@ rpc_dplx_rec_destroy(struct rpc_dplx_rec *rec) { rpc_dplx_lock_destroy(&rec->recv.lock); mutex_destroy(&rec->xprt.xp_lock); + mutex_destroy(&rec->writeq.qmutex); #if defined(HAVE_BLKIN) if (rec->xprt.blkin.svc_name) diff --git a/src/svc.c b/src/svc.c index 6932033ece..faf3dfec40 100644 --- a/src/svc.c +++ b/src/svc.c @@ -189,7 +189,6 @@ svc_init(svc_init_params *params) if (work_pool_params.thrd_max < work_pool_params.thrd_min) work_pool_params.thrd_max = work_pool_params.thrd_min; - svc_ioq_init(); if (work_pool_init(&svc_work_pool, "svc_", &work_pool_params)) { mutex_unlock(&__svc_params->mtx); return false; diff --git a/src/svc_ioq.c b/src/svc_ioq.c index 88068c3615..059ec19172 100644 --- a/src/svc_ioq.c +++ b/src/svc_ioq.c @@ -66,54 +66,6 @@ #include #include "svc_ioq.h" -/* Send queues, configurable using RPC_Ioq_ThrdMax - * - 
* Ideally, these would be some variant of weighted fair queuing. Currently, - * assuming supplied by underlying OS. - * - * The assigned thread should have affinity for the interface. Therefore, the - * first thread arriving for each interface is used for all subsequent work, - * until the interface is idle. This assumes that the output interface is - * closely associated with the input interface. - * - * Note that this is a fixed size list of interfaces. In most cases, - * many of these entries will be unused. - * - * For efficiency, a mask is applied to the ifindex, possibly causing overlap of - * multiple interfaces. The size is selected to be larger than expected number - * of concurrently active interfaces. Size must be a power of 2 for mask. - */ -static int num_send_queues; /* must be a power of 2 */ -static struct poolq_head *ioq_ifqh; - -static inline int -svc_ioq_mask(int fd) -{ - return fd & (num_send_queues - 1); /* num_send_queues is a power of 2 */ -} - -void -svc_ioq_init(void) -{ - struct poolq_head *ifph; - int i; - - /* We would like to make the number of send queues close to half - * of the thrd_max. Also, the number of send queues must be a - * power 2 for quick bitmask hashig! 
- */ - num_send_queues = 1; - while (num_send_queues * 2 < __svc_params->ioq.thrd_max / 2) - num_send_queues <<= 1; - - ioq_ifqh = mem_calloc(num_send_queues, sizeof(struct poolq_head)); - for (i = 0, ifph = &ioq_ifqh[0]; i < num_send_queues; ifph++, i++) { - ifph->qcount = 0; - TAILQ_INIT(&ifph->qh); - mutex_init(&ifph->qmutex, NULL); - } -} - #define LAST_FRAG ((u_int32_t)(1 << 31)) #define MAXALLOCA (256) @@ -249,6 +201,7 @@ svc_ioq_write(SVCXPRT *xprt, struct xdr_ioq *xioq, struct poolq_head *ifph) if (--(ifph->qcount) == 0) break; + /* Grab next one */ have = TAILQ_FIRST(&ifph->qh); TAILQ_REMOVE(&ifph->qh, have, q); mutex_unlock(&ifph->qmutex); @@ -264,7 +217,8 @@ svc_ioq_write_callback(struct work_pool_entry *wpe) { struct xdr_ioq *xioq = opr_containerof(wpe, struct xdr_ioq, ioq_wpe); SVCXPRT *xprt = (SVCXPRT *)xioq->xdrs[0].x_lib[1]; - struct poolq_head *ifph = &ioq_ifqh[svc_ioq_mask(xprt->xp_fd)]; + struct rpc_dplx_rec *rec = REC_XPRT(xprt); + struct poolq_head *ifph = &rec->writeq; svc_ioq_write(xprt, xioq, ifph); } @@ -272,7 +226,8 @@ svc_ioq_write_callback(struct work_pool_entry *wpe) void svc_ioq_write_now(SVCXPRT *xprt, struct xdr_ioq *xioq) { - struct poolq_head *ifph = &ioq_ifqh[svc_ioq_mask(xprt->xp_fd)]; + struct rpc_dplx_rec *rec = REC_XPRT(xprt); + struct poolq_head *ifph = &rec->writeq; SVC_REF(xprt, SVC_REF_FLAG_NONE); mutex_lock(&ifph->qmutex); @@ -301,7 +256,8 @@ svc_ioq_write_now(SVCXPRT *xprt, struct xdr_ioq *xioq) void svc_ioq_write_submit(SVCXPRT *xprt, struct xdr_ioq *xioq) { - struct poolq_head *ifph = &ioq_ifqh[svc_ioq_mask(xprt->xp_fd)]; + struct rpc_dplx_rec *rec = REC_XPRT(xprt); + struct poolq_head *ifph = &rec->writeq; SVC_REF(xprt, SVC_REF_FLAG_NONE); mutex_lock(&ifph->qmutex); diff --git a/src/svc_ioq.h b/src/svc_ioq.h index d36d4c5e25..550d3f7689 100644 --- a/src/svc_ioq.h +++ b/src/svc_ioq.h @@ -29,7 +29,6 @@ #include #include -void svc_ioq_init(void); void svc_ioq_write_now(SVCXPRT *, struct xdr_ioq *); void 
svc_ioq_write_submit(SVCXPRT *, struct xdr_ioq *); From c2d4cd94e330516562a660d757bba875dcdb9d4a Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Thu, 8 Aug 2019 11:19:53 -0700 Subject: [PATCH 33/70] Remove obsolete svc_rqst_run_task This should have actually been removed by this commit: d16697b4d1da8a9216f5a955005b7cd1ac28a913 Simplify epoll task processing Signed-off-by: Frank S. Filz --- src/svc_rqst.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index a31fcf61e3..49f314f885 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -195,7 +195,6 @@ svc_rqst_lookup_chan(uint32_t chan_id) } /* forward declaration in lieu of moving code {WAS} */ -static void svc_rqst_run_task(struct work_pool_entry *); static void svc_rqst_epoll_loop(struct work_pool_entry *wpe); static void svc_complete_task(struct svc_rqst_rec *sr_rec, bool finished); @@ -290,7 +289,7 @@ svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) struct svc_rqst_rec *sr_rec; uint32_t n_id; int code = 0; - work_pool_fun_t fun = svc_rqst_run_task; + work_pool_fun_t fun = NULL; mutex_lock(&svc_rqst_set.mtx); if (!svc_rqst_set.next_id) { @@ -1140,29 +1139,6 @@ static void svc_complete_task(struct svc_rqst_rec *sr_rec, bool finished) svc_rqst_release(sr_rec); } -/* - * No locking, "there can be only one" - */ -static void -svc_rqst_run_task(struct work_pool_entry *wpe) -{ - struct svc_rqst_rec *sr_rec = - opr_containerof(wpe, struct svc_rqst_rec, ev_wpe); - - /* enter event loop */ - switch (sr_rec->ev_type) { - default: - /* XXX formerly select/fd_set case, now placeholder for new - * event systems, reworked select, etc. 
*/ - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: unsupported event type", - __func__); - break; - } /* switch */ - - svc_complete_task(sr_rec, true); -} - int svc_rqst_thrd_signal(uint32_t chan_id, uint32_t flags) { From 1dd5a369bd86c0f018d6602fb234126f8f1307ff Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Tue, 6 Aug 2019 10:22:19 -0700 Subject: [PATCH 34/70] Implement nonblocking send - convert from writev to sendmsg The xdr_ioq keeps track of the progress of send and if any calls to sendmsg ever would have blocked. sendmsg will send as much data as it can without blocking, and then a subsequent call to send more data will likely return EAGAIN or EWOULDBLOCK to indicate it might block. A pointer to the duplex record is also kept for ease of finding it when the epoll event to unblock is triggered. Each duplex record now has two events, one for receive and one for send. It also has a pointer to the in progress send xdr_ioq. The SVCXPRT has a duplicate xp_fd_send to be used along with event_send to allow epoll for both receive and waiting for send to unblock. svc_ioq_flushv has the most changes to manage the conversion to sendmsg and dealing with partial write of the xdr_ioq. With the possibility of blocked send xdr_ioq, the writeq must be managed differently. Every xdr_ioq is queued into the writeq, but before doing so we remember if the writeq was empty so we can either immediately start working on the send (svc_ioq_write_now) or submit a work pool task (svc_ioq_write_submit). When the send unblocks, svc_rqst_xprt_task_send directly calls svc_ioq_write since the blocked send is at the head of the writeq. Signed-off-by: Daniel Gryniewicz Signed-off-by: Frank S. 
Filz --- ntirpc/lttng/xprt.h | 308 ++++++++++++++ ntirpc/rpc/svc.h | 8 +- ntirpc/rpc/xdr_ioq.h | 4 + src/rpc_dplx_internal.h | 8 +- src/svc_dg.c | 2 +- src/svc_internal.h | 19 +- src/svc_ioq.c | 432 +++++++++++++------ src/svc_ioq.h | 1 + src/svc_rqst.c | 890 +++++++++++++++++++++++++++++++--------- src/svc_vc.c | 82 +++- src/svc_xprt.c | 1 + 11 files changed, 1431 insertions(+), 324 deletions(-) diff --git a/ntirpc/lttng/xprt.h b/ntirpc/lttng/xprt.h index b5b62c0662..c6c0c87024 100644 --- a/ntirpc/lttng/xprt.h +++ b/ntirpc/lttng/xprt.h @@ -79,6 +79,314 @@ TRACEPOINT_LOGLEVEL( unref, TRACE_INFO) +TRACEPOINT_EVENT( + xprt, + destroy, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + uint16_t, flags), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer(uint16_t, count, flags) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + destroy, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + unhook, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + uint32_t, flags), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer_hex(uint32_t, flags, flags) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + unhook, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + rearm, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + uint32_t, flags), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer_hex(uint32_t, flags, flags) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + rearm, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + hook, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + uint32_t, flags), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer_hex(uint32_t, flags, flags) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + hook, 
+ TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + event, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + uint32_t, xp_flags, + uint32_t, ev_flag), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer_hex(uint32_t, xp_flags, xp_flags) + ctf_integer_hex(uint32_t, ev_flag, ev_flag) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + event, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + recv, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + unsigned int, destroyed, + unsigned int, count), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer(unsigned int, destroyed, destroyed) + ctf_integer(unsigned int, count, count) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + recv, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + send, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + unsigned int, destroyed, + unsigned int, count), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer(unsigned int, destroyed, destroyed) + ctf_integer(unsigned int, count, count) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + send, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + write_blocked, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + write_blocked, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + write_complete, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + unsigned int, has_blocked), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer(unsigned int, has_blocked, has_blocked) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + write_complete, + 
TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + sendmsg, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + unsigned int, remaining, + unsigned int, frag_needed, + unsigned int, iov_count), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer(unsigned int, remaining, remaining) + ctf_integer(unsigned int, frag_needed, frag_needed) + ctf_integer(unsigned int, iov_count, iov_count) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + sendmsg, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + mutex, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + mutex, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + funcin, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + funcin, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + recv_frag, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + int32_t, frag_len), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer_hex(int32_t, frag_len, frag_len) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + recv_frag, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + recv_bytes, + TP_ARGS(const char *, function, + unsigned int, line, + void *, xprt, + int32_t, frag_remain, + ssize_t, bytes), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_integer_hex(int32_t, frag_remain, frag_remain) + ctf_integer_hex(ssize_t, bytes, bytes) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + recv_bytes, + TRACE_INFO) + +TRACEPOINT_EVENT( + xprt, + recv_exit, + TP_ARGS(const char 
*, function, + unsigned int, line, + void *, xprt, + const char *, reason, + int, code), + TP_FIELDS( + ctf_string(fnc, function) + ctf_integer(unsigned int, line, line) + ctf_integer_hex(void *, xprt, xprt) + ctf_string(reason, reason) + ctf_integer(int, code, code) + ) +) + +TRACEPOINT_LOGLEVEL( + xprt, + recv_exit, + TRACE_INFO) + #endif /* GANESHA_LTTNG_XPRT_TP_H */ #undef TRACEPOINT_INCLUDE diff --git a/ntirpc/rpc/svc.h b/ntirpc/rpc/svc.h index 169fe6b0f0..d3eded4315 100644 --- a/ntirpc/rpc/svc.h +++ b/ntirpc/rpc/svc.h @@ -151,7 +151,8 @@ typedef struct svc_init_params { #define SVC_XPRT_FLAG_NONE 0x0000 /* uint16_t actually used */ -#define SVC_XPRT_FLAG_ADDED 0x0001 +#define SVC_XPRT_FLAG_ADDED_RECV 0x0001 +#define SVC_XPRT_FLAG_ADDED_SEND 0x0002 #define SVC_XPRT_FLAG_INITIAL 0x0004 #define SVC_XPRT_FLAG_INITIALIZED 0x0008 @@ -265,6 +266,7 @@ struct svc_xprt { mutex_t xp_lock; int xp_fd; + int xp_fd_send; /* Sometimes a dup of xp_fd needed for send */ int xp_ifindex; /* interface index */ int xp_si_type; /* si type */ int xp_type; /* xprt type */ @@ -456,6 +458,10 @@ static inline void svc_destroy_it(SVCXPRT *xprt, XPRT_TRACE(xprt, __func__, tag, line); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, destroy, tag, line, xprt, flags); +#endif /* USE_LTTNG_NTIRPC */ + if (flags & SVC_XPRT_FLAG_DESTROYING) { /* previously set, do nothing */ return; diff --git a/ntirpc/rpc/xdr_ioq.h b/ntirpc/rpc/xdr_ioq.h index 8effc8d786..9d46e32373 100644 --- a/ntirpc/rpc/xdr_ioq.h +++ b/ntirpc/rpc/xdr_ioq.h @@ -89,6 +89,10 @@ struct xdr_ioq { struct xdr_ioq_uv_head ioq_uv; /* header/vectors */ uint64_t id; + uint32_t write_start; /* Position to start write at */ + int frag_hdr_bytes_sent; /* Indicates a fragment header has been sent */ + bool has_blocked; + struct rpc_dplx_rec *rec; }; #define _IOQ(p) (opr_containerof((p), struct xdr_ioq, ioq_s)) diff --git a/src/rpc_dplx_internal.h b/src/rpc_dplx_internal.h index b21c848a5e..ef3dd2985b 100644 --- a/src/rpc_dplx_internal.h +++ 
b/src/rpc_dplx_internal.h @@ -48,6 +48,8 @@ typedef struct rpc_dplx_lock { } locktrace; } rpc_dplx_lock_t; +struct svc_rqst_rec; + /* new unified state */ struct rpc_dplx_rec { struct svc_xprt xprt; /**< Transport Independent handle */ @@ -66,11 +68,13 @@ struct rpc_dplx_rec { union { #if defined(TIRPC_EPOLL) struct { - struct epoll_event event; + struct epoll_event event_recv; + struct epoll_event event_send; + struct xdr_ioq *xioq_send; } epoll; #endif } ev_u; - void *ev_p; /* struct svc_rqst_rec (internal) */ + struct svc_rqst_rec *ev_p; /* struct svc_rqst_rec (internal) */ size_t maxrec; long pagesz; diff --git a/src/svc_dg.c b/src/svc_dg.c index 7e13d57f6b..156d5ff5f7 100644 --- a/src/svc_dg.c +++ b/src/svc_dg.c @@ -271,7 +271,7 @@ svc_dg_rendezvous(SVCXPRT *xprt) return (XPRT_DIED); } - if (unlikely(svc_rqst_rearm_events(xprt))) { + if (unlikely(svc_rqst_rearm_events(xprt, SVC_XPRT_FLAG_ADDED_RECV))) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p fd %d svc_rqst_rearm_events failed (will set dead)", __func__, xprt, xprt->xp_fd); diff --git a/src/svc_internal.h b/src/svc_internal.h index 2d9b63e947..c51e437fc8 100644 --- a/src/svc_internal.h +++ b/src/svc_internal.h @@ -152,8 +152,25 @@ svc_override_ops(struct xp_ops *ops, SVCXPRT *rendezvous) } /* in svc_rqst.c */ -int svc_rqst_rearm_events(SVCXPRT *); +int svc_rqst_rearm_events_locked(SVCXPRT *, uint16_t); + +static inline int svc_rqst_rearm_events(SVCXPRT *xprt, uint16_t ev_flags) +{ + struct rpc_dplx_rec *rec = REC_XPRT(xprt); + int code; + + rpc_dplx_rli(rec); + + code = svc_rqst_rearm_events_locked(xprt, ev_flags); + + rpc_dplx_rui(rec); + + return code; +} + int svc_rqst_xprt_register(SVCXPRT *, SVCXPRT *); void svc_rqst_xprt_unregister(SVCXPRT *, uint32_t); +int svc_rqst_evchan_write(SVCXPRT *, struct xdr_ioq *, bool); +void svc_rqst_xprt_send_complete(SVCXPRT *); #endif /* TIRPC_SVC_INTERNAL_H */ diff --git a/src/svc_ioq.c b/src/svc_ioq.c index 059ec19172..e06fb0d68a 100644 --- a/src/svc_ioq.c +++ 
b/src/svc_ioq.c @@ -67,119 +67,268 @@ #include "svc_ioq.h" #define LAST_FRAG ((u_int32_t)(1 << 31)) +#define LAST_FRAG_XDR_UNITS ((LAST_FRAG - 1) & ~(BYTES_PER_XDR_UNIT - 1)) #define MAXALLOCA (256) static inline int svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) { - struct iovec *iov, *tiov, *wiov; - struct poolq_entry *have; - struct xdr_ioq_uv *data; + struct msghdr msg; + struct iovec *iov; + struct xdr_vio *vio; ssize_t result; u_int32_t frag_header; u_int32_t fbytes; - u_int32_t remaining = 0; - u_int32_t vsize = (xioq->ioq_uv.uvqh.qcount + 1) * sizeof(struct iovec); - int iw = 0; - int ix = 1; - int rc = 0; + int error = 0; + int frag_needed = 0; + u_int32_t last_frag = 0; + u_int32_t end, remaining, iov_count, vsize, isize; + + /* update the most recent data length, just in case */ + xdr_tail_update(xioq->xdrs); + + /* Some basic computations */ + end = XDR_GETPOS(xioq->xdrs); + remaining = end - xioq->write_start; + iov_count = XDR_IOVCOUNT(xioq->xdrs, xioq->write_start, remaining); + vsize = (iov_count + 1) * sizeof(struct iovec); + isize = iov_count * sizeof(struct xdr_vio); + + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "-------> %s: remaining %"PRIu32" write_start %"PRIu32 + " end %"PRIu32, + __func__, remaining, xioq->write_start, end); + + memset(&msg, 0, sizeof(msg)); + + if (end > (2 * LAST_FRAG_XDR_UNITS)) { + /* This data will need to be 3 fragments */ + if (xioq->write_start < LAST_FRAG_XDR_UNITS) { + fbytes = LAST_FRAG_XDR_UNITS - xioq->write_start; + } else if (xioq->write_start < (2 * LAST_FRAG_XDR_UNITS)) { + fbytes = (2 * LAST_FRAG_XDR_UNITS) - xioq->write_start; + } else { + fbytes = end - xioq->write_start; + last_frag = LAST_FRAG; + } + } else if (end > LAST_FRAG_XDR_UNITS) { + /* This data will need to be 2 fragments */ + if (xioq->write_start < LAST_FRAG_XDR_UNITS) { + fbytes = LAST_FRAG_XDR_UNITS - xioq->write_start; + } else { + fbytes = end - xioq->write_start; + last_frag = LAST_FRAG; + } + } else { + fbytes = remaining; + last_frag = 
LAST_FRAG; + } if (unlikely(vsize > MAXALLOCA)) { iov = mem_alloc(vsize); } else { iov = alloca(vsize); } - wiov = iov; /* position at initial fragment header */ - - /* update the most recent data length, just in case */ - xdr_tail_update(xioq->xdrs); - /* build list after initial fragment header (ix = 1 above) */ - TAILQ_FOREACH(have, &(xioq->ioq_uv.uvqh.qh), q) { - data = IOQ_(have); - tiov = iov + ix; - tiov->iov_base = data->v.vio_head; - tiov->iov_len = ioquv_length(data); - remaining += tiov->iov_len; - ix++; + if (unlikely(isize > MAXALLOCA)) { + vio = mem_alloc(isize); + } else { + vio = alloca(isize); } while (remaining > 0) { - if (iw == 0) { - /* new fragment header, determine last iov */ - fbytes = 0; - for (tiov = &wiov[++iw]; - (tiov < &iov[ix]) && (iw < __svc_maxiov); - ++tiov, ++iw) { - fbytes += tiov->iov_len; - - /* check for fragment value overflow */ - /* never happens, see ganesha FSAL_MAXIOSIZE */ - if (unlikely(fbytes >= LAST_FRAG)) { - fbytes -= tiov->iov_len; - break; - } - } /* for */ - - /* fragment length doesn't include fragment header */ - if (&wiov[iw] < &iov[ix]) { - frag_header = htonl((u_int32_t) (fbytes)); - } else { - frag_header = htonl((u_int32_t) (fbytes | LAST_FRAG)); - } - wiov->iov_base = &(frag_header); - wiov->iov_len = sizeof(u_int32_t); + int i; + int frag_hdr_size = 0; + + /* Note that there may be lots of re-walking the ioq to + * count the number of buffers or fill the buffers in the vio, + * unfortunately, any mechanism to try and avoid that would + * still have to re-walk the ioq, so we don't save THAT much + * by just recomputing in preparation for each attempt to send + * data. We could shortcut a little bit if we could estimate + * how many bytes would fit in a single iovec so that we + * don't walk more of the ioq than we need to. But that adds a + * lot of complexity, and just saves walking a linked list. + * + * A more relevant improvement here might actually be to use + * larger buffers than 8k. 
Optionally, when we do more to + * implement zero copy, the largest responses which are + * READ and READDIR will be adding a single buffer, or a small + * number of buffers to the ioq instead of copying into the + * 8k byte buffers. + */ + iov_count = XDR_IOVCOUNT(xioq->xdrs, xioq->write_start, fbytes); + + if (xioq->write_start == 0 || + xioq->write_start == LAST_FRAG_XDR_UNITS || + xioq->write_start == (2 * LAST_FRAG_XDR_UNITS)) { + /* We need a fragment header, or to complete it. Look + * at xioq->frag_hdr_bytes_sent to know how many bytes + * of it we have sent so far. + */ + frag_needed = 1; + frag_header = htonl((u_int32_t) (fbytes | last_frag)); + iov[0].iov_base = ((char *) &frag_header) + + xioq->frag_hdr_bytes_sent; + iov[0].iov_len = sizeof(frag_header) - + xioq->frag_hdr_bytes_sent; + frag_hdr_size = iov[0].iov_len; + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d iov[0].vio_head %p vio_length %z", + __func__, xprt, xprt->xp_fd, + iov[0].iov_base, iov[0].iov_len); + } - /* writev return includes fragment header */ - remaining += sizeof(u_int32_t); - fbytes += sizeof(u_int32_t); + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d msg_iov %p remaining %"PRIu32 + " fbytes %"PRIu32" iov_count %"PRIu32 + " write_start %"PRIu32" end %"PRIu32 + " frag_needed %d frag_hdr_size %d", + __func__, xprt, xprt->xp_fd, msg.msg_iov, + remaining, fbytes, iov_count, + xioq->write_start, end, frag_needed, frag_hdr_size); + + /* Get an xdr_vio corresponding to the bytes of this fragment */ + if (!XDR_FILLBUFS(xioq->xdrs, xioq->write_start, vio, fbytes)) { + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s() XDR_FILLBUFS failed", __func__); + SVC_DESTROY(xprt); + break; } - /* blocking write */ - result = writev(xprt->xp_fd, wiov, iw); - remaining -= result; + if (iov_count + frag_needed > UIO_MAXIOV) { + /* sendmsg can only take UIO_MAXIOV iovecs */ + iov_count = UIO_MAXIOV - frag_needed; + } - if (result == fbytes) { - wiov += iw - 1; - iw = 0; - continue; + /* Convert the xdr_vio 
to an iovec */ + for (i = 0; i < iov_count; i++) { + iov[i + frag_needed].iov_base = vio[i].vio_head; + iov[i + frag_needed].iov_len = vio[i].vio_length; + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d iov[%d].vio_head %p vio_length %z", + __func__, xprt, xprt->xp_fd, i + frag_needed, + iov[i + frag_needed].iov_base, + iov[i + frag_needed].iov_len); } + + msg.msg_iov = iov; + msg.msg_iovlen = iov_count + frag_needed; + +again: + +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, sendmsg, __func__, __LINE__, + xprt, + (unsigned int) remaining, + (unsigned int) frag_needed, + (unsigned int) iov_count); +#endif /* USE_LTTNG_NTIRPC */ + + /* non-blocking write */ + errno = 0; + result = sendmsg(xprt->xp_fd, &msg, MSG_DONTWAIT); + error = errno; + + __warnx((error == EWOULDBLOCK || error == EAGAIN || error == 0) + ? TIRPC_DEBUG_FLAG_SVC_VC + : TIRPC_DEBUG_FLAG_ERROR, + "%s: %p fd %d msg_iov %p sendmsg remaining %" + PRIu32" result %ld error %s (%d)", + __func__, xprt, xprt->xp_fd, msg.msg_iov, + remaining, (long int) result, + strerror(error), error); + if (unlikely(result < 0)) { - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s() writev failed (%d)\n", - __func__, errno); - rc = -1; + if (error == EWOULDBLOCK || error == EAGAIN) { + /* Socket buffer full; don't destroy */ + error = EWOULDBLOCK; + xioq->has_blocked = true; + } break; } + + if (result < frag_hdr_size) { + /* We had a fragment headerr and didn't manage to send + * the entire thing... + */ + xioq->frag_hdr_bytes_sent += result; + iov[0].iov_base += result; + iov[0].iov_len -= result; + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d iov[0].vio_head %p vio_length %z", + __func__, xprt, xprt->xp_fd, + iov[0].iov_base, iov[0].iov_len); + /* Shortcut because we don't need to recompute the + * iovec. + */ + goto again; + } + + /* At this point, the frag header must have been fully sent, + * go ahead and indicate that... Also deduct any fragment + * header bytes from result. 
+ */ + xioq->frag_hdr_bytes_sent = sizeof(frag_header); + result -= frag_hdr_size; + frag_hdr_size = 0; + + /* Keep track of progress */ + remaining -= result; fbytes -= result; - /* rare? writev underrun? (assume never overrun) */ - for (tiov = wiov; iw > 0; ++tiov, --iw) { - if (tiov->iov_len > result) { - tiov->iov_len -= result; - tiov->iov_base += result; - wiov = tiov; - break; + /* Keep track of progress in the xioq */ + xioq->write_start += result; + + if (fbytes == 0) { + /* We completed sending a fragment. */ + xioq->frag_hdr_bytes_sent = 0; + if (remaining > LAST_FRAG_XDR_UNITS) { + fbytes = LAST_FRAG_XDR_UNITS; } else { - result -= tiov->iov_len; + fbytes = remaining; } - } /* for */ + frag_needed = 1; + } else { + frag_needed = 0; + } } /* while */ - if (unlikely(vsize > MAXALLOCA)) { + if (unlikely(vsize > MAXALLOCA)) mem_free(iov, vsize); - } - return rc; + if (unlikely(isize > MAXALLOCA)) + mem_free(vio, isize); + + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d returning %s (%d)", + __func__, xprt, xprt->xp_fd, strerror(error), error); + + return error; } -static void -svc_ioq_write(SVCXPRT *xprt, struct xdr_ioq *xioq, struct poolq_head *ifph) +void svc_ioq_write(SVCXPRT *xprt) { + struct rpc_dplx_rec *rec = REC_XPRT(xprt); + struct xdr_ioq *xioq; struct poolq_entry *have; - for (;;) { +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, mutex, __func__, __LINE__, xprt); +#endif /* USE_LTTNG_NTIRPC */ + mutex_lock(&rec->writeq.qmutex); + + have = TAILQ_FIRST(&rec->writeq.qh); + + while (have != NULL) { int rc = 0; + /* Process the xioq from the head of the xprt queue */ + mutex_unlock(&rec->writeq.qmutex); + + xioq = _IOQ(have); + + /* Save has blocked before state */ + bool has_blocked = xioq->has_blocked; /* do i/o unlocked */ if (svc_work_pool.params.thrd_max @@ -188,62 +337,103 @@ svc_ioq_write(SVCXPRT *xprt, struct xdr_ioq *xioq, struct poolq_head *ifph) rc = svc_ioq_flushv(xprt, xioq); } - if (rc < 0) { - /* IO failed, destroy rather than releasing 
*/ - SVC_DESTROY(xprt); - } else { - SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); - } + if (rc != EWOULDBLOCK) { + if (rc < 0) { + /* IO failed, destroy rather than releasing */ + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d About to destroy - rc = %d", + __func__, xprt, xprt->xp_fd, rc); + SVC_DESTROY(xprt); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d About to release", + __func__, xprt, xprt->xp_fd); + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); + } - XDR_DESTROY(xioq->xdrs); + XDR_DESTROY(xioq->xdrs); + } - mutex_lock(&ifph->qmutex); - if (--(ifph->qcount) == 0) +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, mutex, __func__, __LINE__, &rec->xprt); +#endif /* USE_LTTNG_NTIRPC */ + mutex_lock(&rec->writeq.qmutex); + + if (rc == EWOULDBLOCK) { + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d EWOULDBLOCK", + __func__, xprt, xprt->xp_fd); + /* Add to epoll and stop processing this xprt's queue */ +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, write_blocked, __func__, __LINE__, + &rec->xprt); +#endif /* USE_LTTNG_NTIRPC */ + svc_rqst_evchan_write(xprt, xioq, has_blocked); break; + } else if (xioq->has_blocked) { + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d COMPLETED AFTER BLOCKING", + __func__, xprt, xprt->xp_fd); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, write_complete, __func__, __LINE__, + &rec->xprt, (int) xioq->has_blocked); +#endif /* USE_LTTNG_NTIRPC */ + svc_rqst_xprt_send_complete(xprt); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d COMPLETED", + __func__, xprt, xprt->xp_fd); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, write_complete, __func__, __LINE__, + &rec->xprt, (int) xioq->has_blocked); +#endif /* USE_LTTNG_NTIRPC */ + } - /* Grab next one */ - have = TAILQ_FIRST(&ifph->qh); - TAILQ_REMOVE(&ifph->qh, have, q); - mutex_unlock(&ifph->qmutex); + /* Dequeue the completed request */ + TAILQ_REMOVE(&rec->writeq.qh, have, q); - xioq = _IOQ(have); - xprt = (SVCXPRT *)xioq->xdrs[0].x_lib[1]; + /* Fetch the next request */ + 
have = TAILQ_FIRST(&rec->writeq.qh); } - mutex_unlock(&ifph->qmutex); + + mutex_unlock(&rec->writeq.qmutex); } static void svc_ioq_write_callback(struct work_pool_entry *wpe) { struct xdr_ioq *xioq = opr_containerof(wpe, struct xdr_ioq, ioq_wpe); - SVCXPRT *xprt = (SVCXPRT *)xioq->xdrs[0].x_lib[1]; - struct rpc_dplx_rec *rec = REC_XPRT(xprt); - struct poolq_head *ifph = &rec->writeq; - svc_ioq_write(xprt, xioq, ifph); + svc_ioq_write(xioq->xdrs[0].x_lib[1]); } void svc_ioq_write_now(SVCXPRT *xprt, struct xdr_ioq *xioq) { struct rpc_dplx_rec *rec = REC_XPRT(xprt); - struct poolq_head *ifph = &rec->writeq; + bool was_empty; SVC_REF(xprt, SVC_REF_FLAG_NONE); - mutex_lock(&ifph->qmutex); - if ((ifph->qcount)++ > 0) { - /* queue additional output requests without task switch */ - TAILQ_INSERT_TAIL(&ifph->qh, &(xioq->ioq_s), q); - mutex_unlock(&ifph->qmutex); - return; - } - mutex_unlock(&ifph->qmutex); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, mutex, __func__, __LINE__, &rec->xprt); +#endif /* USE_LTTNG_NTIRPC */ + mutex_lock(&rec->writeq.qmutex); + + was_empty = TAILQ_FIRST(&rec->writeq.qh) == NULL; + + /* always queue output requests on the duplex record's writeq */ + TAILQ_INSERT_TAIL(&rec->writeq.qh, &(xioq->ioq_s), q); + + mutex_unlock(&rec->writeq.qmutex); - /* handle this output request without queuing, then any additional - * output requests without a task switch (using this thread). - */ - svc_ioq_write(xprt, xioq, ifph); + if (was_empty) { + /* handle this output request without queuing, then any + * additional output requests without a task switch (using this + * thread). 
+ */ + svc_ioq_write(xprt); + } } /* @@ -257,21 +447,25 @@ void svc_ioq_write_submit(SVCXPRT *xprt, struct xdr_ioq *xioq) { struct rpc_dplx_rec *rec = REC_XPRT(xprt); - struct poolq_head *ifph = &rec->writeq; + bool was_empty; SVC_REF(xprt, SVC_REF_FLAG_NONE); - mutex_lock(&ifph->qmutex); - if ((ifph->qcount)++ > 0) { - /* queue additional output requests, they will be handled by - * existing thread without another task switch. - */ - TAILQ_INSERT_TAIL(&ifph->qh, &(xioq->ioq_s), q); - mutex_unlock(&ifph->qmutex); - return; - } - mutex_unlock(&ifph->qmutex); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, mutex, __func__, __LINE__, &xprt); +#endif /* USE_LTTNG_NTIRPC */ + mutex_lock(&rec->writeq.qmutex); - xioq->ioq_wpe.fun = svc_ioq_write_callback; - work_pool_submit(&svc_work_pool, &xioq->ioq_wpe); + was_empty = TAILQ_FIRST(&rec->writeq.qh) == NULL; + + /* always queue output requests on the duplex record's writeq */ + TAILQ_INSERT_TAIL(&rec->writeq.qh, &(xioq->ioq_s), q); + + mutex_unlock(&rec->writeq.qmutex); + + if (was_empty) { + /* Schedule work to process output for this duplex record. 
*/ + xioq->ioq_wpe.fun = svc_ioq_write_callback; + work_pool_submit(&svc_work_pool, &xioq->ioq_wpe); + } } diff --git a/src/svc_ioq.h b/src/svc_ioq.h index 550d3f7689..2587d6c3bb 100644 --- a/src/svc_ioq.h +++ b/src/svc_ioq.h @@ -29,6 +29,7 @@ #include #include +void svc_ioq_write(SVCXPRT *); void svc_ioq_write_now(SVCXPRT *, struct xdr_ioq *); void svc_ioq_write_submit(SVCXPRT *, struct xdr_ioq *); diff --git a/src/svc_rqst.c b/src/svc_rqst.c index 49f314f885..e35c630ee0 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -50,6 +50,7 @@ #include "svc_internal.h" #include "svc_xprt.h" #include +#include "svc_ioq.h" /** * @file svc_rqst.c @@ -94,6 +95,7 @@ struct svc_rqst_rec { struct epoll_event ctrl_ev; struct epoll_event *events; u_int max_events; /* max epoll events */ + bool sv1_added; } epoll; #endif struct { @@ -103,8 +105,69 @@ struct svc_rqst_rec { int32_t ev_refcnt; uint16_t ev_flags; + struct xdr_ioq *xioq; /* IOQ for floating sr_rec */ }; +void svc_rqst_rec_init(struct svc_rqst_rec *sr_rec) +{ + /* Pre-initialize stuff that needs to be non-zero */ + mutex_init(&sr_rec->ev_lock, NULL); + sr_rec->sv[0] = -1; + sr_rec->sv[1] = -1; + sr_rec->id_k = UINT32_MAX; +#if defined(TIRPC_EPOLL) + sr_rec->ev_u.epoll.epoll_fd = -1; +#endif +} + +void svc_rqst_rec_destroy(struct svc_rqst_rec *sr_rec) +{ +#if defined(TIRPC_EPOLL) + if (sr_rec->ev_u.epoll.sv1_added) { + int code; + + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, EPOLL_CTL_DEL, + sr_rec->sv[1], &sr_rec->ev_u.epoll.ctrl_ev); + if (code) { + code = errno; + __warnx(TIRPC_DEBUG_FLAG_WARN, + "%s: sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) unhook failed (%d)", + __func__, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], code); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) unhook event %p", + __func__, + 
sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], &sr_rec->ev_u.epoll.ctrl_ev); + } + } +#endif + + if (sr_rec->sv[0] >= 0) { + close(sr_rec->sv[0]); + sr_rec->sv[0] = -1; + } + + if (sr_rec->sv[1] >= 0) { + close(sr_rec->sv[1]); + sr_rec->sv[1] = -1; + } + +#if defined(TIRPC_EPOLL) + if (sr_rec->ev_u.epoll.epoll_fd > 0) { + close(sr_rec->ev_u.epoll.epoll_fd); + sr_rec->ev_u.epoll.epoll_fd = -1; + } +#endif +} + struct svc_rqst_set { mutex_t mtx; struct svc_rqst_rec *srr; @@ -161,6 +224,8 @@ SetNonBlock(int fd) void svc_rqst_init(uint32_t channels) { + int i; + mutex_lock(&svc_rqst_set.mtx); if (svc_rqst_set.srr) @@ -170,6 +235,10 @@ svc_rqst_init(uint32_t channels) svc_rqst_set.next_id = channels; svc_rqst_set.srr = mem_zalloc(channels * sizeof(struct svc_rqst_rec)); + for (i = 0; i < channels; i++) { + svc_rqst_rec_init(&svc_rqst_set.srr[i]); + } + unlock: mutex_unlock(&svc_rqst_set.mtx); } @@ -232,7 +301,7 @@ void svc_rqst_expire_insert(struct clnt_req *cc) { struct cx_data *cx = CX_DATA(cc->cc_clnt); - struct svc_rqst_rec *sr_rec = (struct svc_rqst_rec *)cx->cx_rec->ev_p; + struct svc_rqst_rec *sr_rec = cx->cx_rec->ev_p; struct opr_rbtree_node *nv; cc->cc_expire_ms = svc_rqst_expire_ms(&cc->cc_timeout); @@ -248,6 +317,10 @@ svc_rqst_expire_insert(struct clnt_req *cc) } mutex_unlock(&sr_rec->ev_lock); + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: sv[0] fd %d before ev_sig (sr_rec %p)", + __func__, sr_rec->sv[0], + sr_rec); ev_sig(sr_rec->sv[0], 0); /* send wakeup */ } @@ -261,6 +334,10 @@ svc_rqst_expire_remove(struct clnt_req *cc) opr_rbtree_remove(&sr_rec->call_expires, &cc->cc_rqst); mutex_unlock(&sr_rec->ev_lock); + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: sv[0] fd %d before ev_sig (sr_rec %p)", + __func__, sr_rec->sv[0], + sr_rec); ev_sig(sr_rec->sv[0], 0); /* send wakeup */ } @@ -283,13 +360,28 @@ svc_rqst_expire_task(struct work_pool_entry *wpe) clnt_req_release(cc); } +static inline void 
+svc_rqst_release(struct svc_rqst_rec *sr_rec) +{ + if (atomic_dec_int32_t(&sr_rec->ev_refcnt) > 0) + return; + + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: remove evchan %d control fd pair (%d:%d)", + __func__, sr_rec->id_k, + sr_rec->sv[0], sr_rec->sv[1]); + + svc_rqst_rec_destroy(sr_rec); +} + int svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) { struct svc_rqst_rec *sr_rec; uint32_t n_id; - int code = 0; + int code = 0, i; work_pool_fun_t fun = NULL; + int32_t ref_rec; mutex_lock(&svc_rqst_set.mtx); if (!svc_rqst_set.next_id) { @@ -300,26 +392,31 @@ svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) return (0); } n_id = --(svc_rqst_set.next_id); + sr_rec = &svc_rqst_set.srr[n_id]; - if (atomic_postinc_int32_t(&sr_rec->ev_refcnt) > 0) { + ref_rec = atomic_postinc_int32_t(&sr_rec->ev_refcnt); + + if (ref_rec > 0) { /* already exists */ *chan_id = n_id; mutex_unlock(&svc_rqst_set.mtx); return (0); } + /* Track the references we have */ + ref_rec++; + flags |= SVC_RQST_FLAG_EPOLL; /* XXX */ /* create a pair of anonymous sockets for async event channel wakeups */ code = socketpair(AF_UNIX, SOCK_STREAM, 0, sr_rec->sv); if (code) { + code = errno; __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: failed creating event signal socketpair (%d)", + "%s: failed creating event signal socketpair (%d) for sr_rec", __func__, code); - ++(svc_rqst_set.next_id); - mutex_unlock(&svc_rqst_set.mtx); - return (code); + goto fail; } /* set non-blocking */ @@ -350,25 +447,33 @@ svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) mem_free(sr_rec->ev_u.epoll.events, sr_rec->ev_u.epoll.max_events * sizeof(struct epoll_event)); - ++(svc_rqst_set.next_id); - mutex_unlock(&svc_rqst_set.mtx); - return (EINVAL); + code = EINVAL; + goto fail; } /* permit wakeup of threads blocked in epoll_wait, with a * couple of possible semantics */ - sr_rec->ev_u.epoll.ctrl_ev.events = - EPOLLIN | EPOLLRDHUP; + 
sr_rec->ev_u.epoll.ctrl_ev.events = EPOLLIN | EPOLLRDHUP; sr_rec->ev_u.epoll.ctrl_ev.data.fd = sr_rec->sv[1]; - code = - epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, EPOLL_CTL_ADD, - sr_rec->sv[1], &sr_rec->ev_u.epoll.ctrl_ev); + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, EPOLL_CTL_ADD, + sr_rec->sv[1], &sr_rec->ev_u.epoll.ctrl_ev); if (code == -1) { code = errno; __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: add control socket failed (%d)", __func__, code); + goto fail; } + + sr_rec->ev_u.epoll.sv1_added = true; + + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | TIRPC_DEBUG_FLAG_REFCNT, + "%s: sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d EPOLL_CTL_ADD code %d event %p", + __func__, + sr_rec, sr_rec->id_k, ref_rec, + sr_rec->ev_u.epoll.epoll_fd, code, + &sr_rec->ev_u.epoll.ctrl_ev); } else { /* legacy fdset (currently unhooked) */ sr_rec->ev_type = SVC_EVENT_FDSET; @@ -377,80 +482,141 @@ svc_rqst_new_evchan(uint32_t *chan_id /* OUT */, void *u_data, uint32_t flags) sr_rec->ev_type = SVC_EVENT_FDSET; #endif - *chan_id = + *chan_id = n_id; + sr_rec->id_k = n_id; sr_rec->ev_flags = flags & SVC_RQST_FLAG_MASK; opr_rbtree_init(&sr_rec->call_expires, svc_rqst_expire_cmpf); - mutex_init(&sr_rec->ev_lock, NULL); - - if (!code) { - atomic_inc_int32_t(&sr_rec->ev_refcnt); - sr_rec->ev_wpe.fun = fun; - sr_rec->ev_wpe.arg = u_data; - work_pool_submit(&svc_work_pool, &sr_rec->ev_wpe); - } - mutex_unlock(&svc_rqst_set.mtx); + atomic_inc_int32_t(&sr_rec->ev_refcnt); + ref_rec++; + sr_rec->ev_wpe.fun = fun; + sr_rec->ev_wpe.arg = u_data; + work_pool_submit(&svc_work_pool, &sr_rec->ev_wpe); __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, "%s: create evchan %d control fd pair (%d:%d)", __func__, n_id, sr_rec->sv[0], sr_rec->sv[1]); - return (code); -} -static inline void -svc_rqst_release(struct svc_rqst_rec *sr_rec) -{ - if (atomic_dec_int32_t(&sr_rec->ev_refcnt) > 0) - return; + if (code != 0) { +fail: + /* Release this event channel id */ + ++(svc_rqst_set.next_id); - __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, - 
"%s: remove evchan %d control fd pair (%d:%d)", - __func__, sr_rec->id_k, - sr_rec->sv[0], sr_rec->sv[1]); + /* Release sr_rec */ + for (i = 0; i < ref_rec; i++) + svc_rqst_release(sr_rec); + } - mutex_destroy(&sr_rec->ev_lock); + mutex_unlock(&svc_rqst_set.mtx); + + return (code); } /* * may be RPC_DPLX_LOCKED, and SVC_XPRT_FLAG_ADDED cleared */ static inline int -svc_rqst_unhook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) +svc_rqst_unhook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec, + uint16_t ev_flags) { int code = EINVAL; +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, unhook, __func__, __LINE__, &rec->xprt, ev_flags); +#endif /* USE_LTTNG_NTIRPC */ + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: xprt %p fd %d ev_flags%s%s%s%s%s%s%s%s%s", + __func__, &rec->xprt, rec->xprt.xp_fd, + ev_flags & SVC_XPRT_FLAG_ADDED_RECV ? " ADDED_RECV" : "", + ev_flags & SVC_XPRT_FLAG_ADDED_SEND ? " ADDED_SEND" : "", + ev_flags & SVC_XPRT_FLAG_INITIAL ? " INITIAL" : "", + ev_flags & SVC_XPRT_FLAG_INITIALIZED ? " INITIALIZED" : "", + ev_flags & SVC_XPRT_FLAG_CLOSE ? " CLOSE" : "", + ev_flags & SVC_XPRT_FLAG_DESTROYING ? " DESTROYING" : "", + ev_flags & SVC_XPRT_FLAG_RELEASING ? " RELEASING" : "", + ev_flags & SVC_XPRT_FLAG_UREG ? " UREG" : "", + sr_rec->ev_flags & SVC_RQST_FLAG_SHUTDOWN + ? 
"sr_rec->ev_flags SHUTDOWN" : ""); + switch (sr_rec->ev_type) { #if defined(TIRPC_EPOLL) case SVC_EVENT_EPOLL: { - struct epoll_event *ev = &rec->ev_u.epoll.event; + struct epoll_event *ev; + + if (ev_flags & SVC_XPRT_FLAG_ADDED_RECV) { + ev = &rec->ev_u.epoll.event_recv; - /* clear epoll vector */ - code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, + /* clear epoll vector */ + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, EPOLL_CTL_DEL, rec->xprt.xp_fd, ev); - if (code) { - code = errno; - __warnx(TIRPC_DEBUG_FLAG_WARN, - "%s: %p fd %d xp_refcnt %" PRId32 - " sr_rec %p evchan %d ev_refcnt %" PRId32 - " epoll_fd %d control fd pair (%d:%d) unhook failed (%d)", - __func__, rec, rec->xprt.xp_fd, - rec->xprt.xp_refcnt, - sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, - sr_rec->ev_u.epoll.epoll_fd, - sr_rec->sv[0], sr_rec->sv[1], code); - } else { - __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | - TIRPC_DEBUG_FLAG_REFCNT, - "%s: %p fd %d xp_refcnt %" PRId32 - " sr_rec %p evchan %d ev_refcnt %" PRId32 - " epoll_fd %d control fd pair (%d:%d) unhook", - __func__, rec, rec->xprt.xp_fd, - rec->xprt.xp_refcnt, - sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, - sr_rec->ev_u.epoll.epoll_fd, - sr_rec->sv[0], sr_rec->sv[1]); + + if (code) { + code = errno; + __warnx(TIRPC_DEBUG_FLAG_WARN, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) unhook failed (%d)", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], code); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) unhook event %p", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], ev); + + atomic_clear_uint16_t_bits( + 
&rec->xprt.xp_flags, + SVC_XPRT_FLAG_ADDED_RECV); + } + } + + if (ev_flags & SVC_XPRT_FLAG_ADDED_SEND) { + ev = &rec->ev_u.epoll.event_send; + + /* clear epoll vector */ + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, + EPOLL_CTL_DEL, rec->xprt.xp_fd_send, ev); + + if (code) { + code = errno; + __warnx(TIRPC_DEBUG_FLAG_WARN, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) unhook failed (%d)", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], code); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) unhook event %p", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], ev); + + atomic_clear_uint16_t_bits( + &rec->xprt.xp_flags, + SVC_XPRT_FLAG_ADDED_SEND); + close(rec->xprt.xp_fd_send); + rec->xprt.xp_fd_send = -1; + } } break; } @@ -465,16 +631,34 @@ svc_rqst_unhook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) } /* - * not locked + * rpc_dplx_rec lock must be held */ int -svc_rqst_rearm_events(SVCXPRT *xprt) +svc_rqst_rearm_events_locked(SVCXPRT *xprt, uint16_t ev_flags) { struct rpc_dplx_rec *rec = REC_XPRT(xprt); - struct svc_rqst_rec *sr_rec = (struct svc_rqst_rec *)rec->ev_p; + struct svc_rqst_rec *sr_rec = rec->ev_p; int code = EINVAL; - if (xprt->xp_flags & (SVC_XPRT_FLAG_ADDED | SVC_XPRT_FLAG_DESTROYED)) +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, rearm, __func__, __LINE__, xprt, ev_flags); +#endif /* USE_LTTNG_NTIRPC */ + + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: xprt %p fd %d ev_flags%s%s%s%s%s%s%s%s%s", + __func__, xprt, xprt->xp_fd, + ev_flags & SVC_XPRT_FLAG_ADDED_RECV ? 
" ADDED_RECV" : "", + ev_flags & SVC_XPRT_FLAG_ADDED_SEND ? " ADDED_SEND" : "", + ev_flags & SVC_XPRT_FLAG_INITIAL ? " INITIAL" : "", + ev_flags & SVC_XPRT_FLAG_INITIALIZED ? " INITIALIZED" : "", + ev_flags & SVC_XPRT_FLAG_CLOSE ? " CLOSE" : "", + ev_flags & SVC_XPRT_FLAG_DESTROYING ? " DESTROYING" : "", + ev_flags & SVC_XPRT_FLAG_RELEASING ? " RELEASING" : "", + ev_flags & SVC_XPRT_FLAG_UREG ? " UREG" : "", + sr_rec->ev_flags & SVC_RQST_FLAG_SHUTDOWN + ? "sr_rec->ev_flags SHUTDOWN" : ""); + + if (xprt->xp_flags & (ev_flags | SVC_XPRT_FLAG_DESTROYED)) return (0); /* MUST follow the destroyed check above */ @@ -482,48 +666,95 @@ svc_rqst_rearm_events(SVCXPRT *xprt) return (0); SVC_REF(xprt, SVC_REF_FLAG_NONE); - rpc_dplx_rli(rec); /* assuming success */ - atomic_set_uint16_t_bits(&xprt->xp_flags, SVC_XPRT_FLAG_ADDED); + atomic_set_uint16_t_bits(&xprt->xp_flags, ev_flags); switch (sr_rec->ev_type) { #if defined(TIRPC_EPOLL) case SVC_EVENT_EPOLL: { - struct epoll_event *ev = &rec->ev_u.epoll.event; - - /* set up epoll user data */ - ev->events = EPOLLIN | EPOLLONESHOT; + struct epoll_event *ev; + + if (ev_flags & SVC_XPRT_FLAG_ADDED_RECV) { + ev = &rec->ev_u.epoll.event_recv; + + /* set up epoll user data */ + ev->events = EPOLLIN | EPOLLONESHOT; + + /* rearm in epoll vector */ + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, + EPOLL_CTL_MOD, rec->xprt.xp_fd, ev); + if (code) { + code = errno; + atomic_clear_uint16_t_bits( + &xprt->xp_flags, + SVC_XPRT_FLAG_ADDED_RECV); + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) rearm failed (%d)", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], code); + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: %p fd %d xp_refcnt %" PRId32 + " 
sr_rec %p evchan %d ev_refcnt %"PRId32 + " epoll_fd %d control fd pair (%d:%d) rearm event %p", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], ev); + } + } - /* rearm in epoll vector */ - code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, - EPOLL_CTL_MOD, xprt->xp_fd, ev); - if (code) { - code = errno; - atomic_clear_uint16_t_bits(&xprt->xp_flags, - SVC_XPRT_FLAG_ADDED); - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: %p fd %d xp_refcnt %" PRId32 - " sr_rec %p evchan %d ev_refcnt %" PRId32 - " epoll_fd %d control fd pair (%d:%d) rearm failed (%d)", - __func__, rec, rec->xprt.xp_fd, - rec->xprt.xp_refcnt, - sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, - sr_rec->ev_u.epoll.epoll_fd, - sr_rec->sv[0], sr_rec->sv[1], code); - SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); - } else { - __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | - TIRPC_DEBUG_FLAG_REFCNT, - "%s: %p fd %d xp_refcnt %" PRId32 - " sr_rec %p evchan %d ev_refcnt %" PRId32 - " epoll_fd %d control fd pair (%d:%d) rearm", - __func__, rec, rec->xprt.xp_fd, - rec->xprt.xp_refcnt, - sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, - sr_rec->ev_u.epoll.epoll_fd, - sr_rec->sv[0], sr_rec->sv[1]); + if (ev_flags & SVC_XPRT_FLAG_ADDED_SEND) { + ev = &rec->ev_u.epoll.event_recv; + + /* set up epoll user data */ + ev->data.ptr = rec; + + /* wait for write events, edge triggered, oneshot */ + ev->events = EPOLLONESHOT | EPOLLOUT | EPOLLET; + + /* rearm in epoll vector */ + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, + EPOLL_CTL_MOD, rec->xprt.xp_fd_send, + ev); + + if (code) { + code = errno; + atomic_clear_uint16_t_bits( + &xprt->xp_flags, + SVC_XPRT_FLAG_ADDED_SEND); + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) rearm failed (%d)", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + 
sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], code); + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %"PRId32 + " epoll_fd %d control fd pair (%d:%d) rearm event %p", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], ev); + } } break; } @@ -534,8 +765,6 @@ svc_rqst_rearm_events(SVCXPRT *xprt) break; } /* switch */ - rpc_dplx_rui(rec); - return (code); } @@ -543,49 +772,117 @@ svc_rqst_rearm_events(SVCXPRT *xprt) * RPC_DPLX_LOCKED, and SVC_XPRT_FLAG_ADDED set */ static inline int -svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) +svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec, + uint16_t ev_flags) { int code = EINVAL; +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, hook, __func__, __LINE__, &rec->xprt, ev_flags); +#endif /* USE_LTTNG_NTIRPC */ + + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: xprt %p fd %d ev_flags%s%s%s%s%s%s%s%s%s", + __func__, &rec->xprt, rec->xprt.xp_fd, + ev_flags & SVC_XPRT_FLAG_ADDED_RECV ? " ADDED_RECV" : "", + ev_flags & SVC_XPRT_FLAG_ADDED_SEND ? " ADDED_SEND" : "", + ev_flags & SVC_XPRT_FLAG_INITIAL ? " INITIAL" : "", + ev_flags & SVC_XPRT_FLAG_INITIALIZED ? " INITIALIZED" : "", + ev_flags & SVC_XPRT_FLAG_CLOSE ? " CLOSE" : "", + ev_flags & SVC_XPRT_FLAG_DESTROYING ? " DESTROYING" : "", + ev_flags & SVC_XPRT_FLAG_RELEASING ? " RELEASING" : "", + ev_flags & SVC_XPRT_FLAG_UREG ? " UREG" : "", + sr_rec->ev_flags & SVC_RQST_FLAG_SHUTDOWN + ? 
"sr_rec->ev_flags SHUTDOWN" : ""); + + /* assuming success */ + atomic_set_uint16_t_bits(&rec->xprt.xp_flags, ev_flags); + switch (sr_rec->ev_type) { #if defined(TIRPC_EPOLL) case SVC_EVENT_EPOLL: { - struct epoll_event *ev = &rec->ev_u.epoll.event; - - /* set up epoll user data */ - ev->data.ptr = rec; - - /* wait for read events, level triggered, oneshot */ - ev->events = EPOLLIN | EPOLLONESHOT; + struct epoll_event *ev; + + if (ev_flags & SVC_XPRT_FLAG_ADDED_RECV) { + ev = &rec->ev_u.epoll.event_recv; + + /* set up epoll user data */ + ev->data.ptr = rec; + + /* wait for read events, level triggered, oneshot */ + ev->events = EPOLLONESHOT | EPOLLIN; + + /* add to epoll vector */ + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, + EPOLL_CTL_ADD, rec->xprt.xp_fd, ev); + if (code) { + code = errno; + atomic_clear_uint16_t_bits( + &rec->xprt.xp_flags, + SVC_XPRT_FLAG_ADDED_RECV); + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) direction in hook failed (%d)", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], code); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) direction in hook event %p", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], ev); + } + } - /* add to epoll vector */ - code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, - EPOLL_CTL_ADD, rec->xprt.xp_fd, ev); - if (code) { - code = errno; - atomic_clear_uint16_t_bits(&rec->xprt.xp_flags, - SVC_XPRT_FLAG_ADDED); - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: %p fd %d xp_refcnt %" PRId32 - " sr_rec %p evchan %d ev_refcnt %" PRId32 - " epoll_fd %d 
control fd pair (%d:%d) hook failed (%d)", - __func__, rec, rec->xprt.xp_fd, - rec->xprt.xp_refcnt, - sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, - sr_rec->ev_u.epoll.epoll_fd, - sr_rec->sv[0], sr_rec->sv[1], code); - } else { - __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | - TIRPC_DEBUG_FLAG_REFCNT, - "%s: %p fd %d xp_refcnt %" PRId32 - " sr_rec %p evchan %d ev_refcnt %" PRId32 - " epoll_fd %d control fd pair (%d:%d) hook", - __func__, rec, rec->xprt.xp_fd, - rec->xprt.xp_refcnt, - sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, - sr_rec->ev_u.epoll.epoll_fd, - sr_rec->sv[0], sr_rec->sv[1]); + if (ev_flags & SVC_XPRT_FLAG_ADDED_SEND) { + ev = &rec->ev_u.epoll.event_send; + + /* set up epoll user data */ + ev->data.ptr = rec; + + /* wait for write events, edge triggered, oneshot */ + ev->events = EPOLLONESHOT | EPOLLOUT | EPOLLET; + + /* add to epoll vector */ + code = epoll_ctl(sr_rec->ev_u.epoll.epoll_fd, + EPOLL_CTL_ADD, rec->xprt.xp_fd_send, + ev); + if (code) { + code = errno; + atomic_clear_uint16_t_bits( + &rec->xprt.xp_flags, + SVC_XPRT_FLAG_ADDED_SEND); + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) direction out hook failed (%d)", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], code); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: %p fd %d xp_refcnt %" PRId32 + " sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d control fd pair (%d:%d) direction out hook event %p", + __func__, rec, rec->xprt.xp_fd, + rec->xprt.xp_refcnt, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, + sr_rec->sv[0], sr_rec->sv[1], ev); + } } break; } @@ -596,6 +893,10 @@ svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) break; } /* switch */ + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: sv[0] fd %d 
before ev_sig (sr_rec %p)", + __func__, sr_rec->sv[0], + sr_rec); ev_sig(sr_rec->sv[0], 0); /* send wakeup */ return (code); @@ -607,18 +908,119 @@ svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) static void svc_rqst_unreg(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) { - uint16_t xp_flags = atomic_postclear_uint16_t_bits(&rec->xprt.xp_flags, - SVC_XPRT_FLAG_ADDED); + uint16_t xp_flags = + atomic_postclear_uint16_t_bits(&rec->xprt.xp_flags, + SVC_XPRT_FLAG_ADDED_RECV | + SVC_XPRT_FLAG_ADDED_SEND); /* clear events */ - if (xp_flags & SVC_XPRT_FLAG_ADDED) - (void)svc_rqst_unhook_events(rec, sr_rec); + if (xp_flags & (SVC_XPRT_FLAG_ADDED_RECV | SVC_XPRT_FLAG_ADDED_SEND)) + (void)svc_rqst_unhook_events(rec, sr_rec, xp_flags); /* Unlinking after debug message ensures both the xprt and the sr_rec * are still present, as the xprt unregisters before release. */ - rec->ev_p = NULL; - svc_rqst_release(sr_rec); + if (rec->ev_p == sr_rec) { + rec->ev_p = NULL; + svc_rqst_release(sr_rec); + } +} + +void svc_rqst_xprt_send_complete(SVCXPRT *xprt) +{ + struct rpc_dplx_rec *rec = REC_XPRT(xprt); + struct svc_rqst_rec *sr_rec; + + sr_rec = rec->ev_p; + + if (!sr_rec) { + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: %p has no attached sr_rec", + __func__, xprt); + return; + } + + (void)svc_rqst_unhook_events(rec, sr_rec, SVC_XPRT_FLAG_ADDED_SEND); +} + +int +svc_rqst_evchan_write(SVCXPRT *xprt, struct xdr_ioq *xioq, bool has_blocked) +{ + struct rpc_dplx_rec *rec = REC_XPRT(xprt); + struct svc_rqst_rec *sr_rec; + int code; + + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: xprt %p xioq %p has_blocked %s", + __func__, xprt, xioq, has_blocked ? 
"TRUE" : "FALSE"); + + sr_rec = rec->ev_p; + + if (!sr_rec) { + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: %p has no attached sr_rec", + __func__, xprt); + return (ENOENT); + } + + rec->ev_u.epoll.xioq_send = xioq; + +#if defined(TIRPC_EPOLL) + if (sr_rec->ev_type == SVC_EVENT_EPOLL) { + /* For send we need to dup the xprt fd */ + if (xprt->xp_fd_send == -1) { + xprt->xp_fd_send = dup(xprt->xp_fd); + + if (xprt->xp_fd_send< 0) { + code = errno; + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: failed duplicating fd (%d)", + __func__, code); + goto out; + } + + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: xp_fd_send fd %d dup of xp_fd %d", + __func__, xprt->xp_fd_send, xprt->xp_fd); + } + } +#endif + + rpc_dplx_rli(rec); + + /* register on event channel */ + if (has_blocked) { + code = svc_rqst_rearm_events_locked(xprt, + SVC_XPRT_FLAG_ADDED_SEND); + } else { + code = svc_rqst_hook_events(rec, sr_rec, + SVC_XPRT_FLAG_ADDED_SEND); + } + + if (code) { + __warnx(TIRPC_DEBUG_FLAG_ERROR, + "%s: failed hooking events (%d)", + __func__, code); + goto out; + } + + atomic_inc_int32_t(&sr_rec->ev_refcnt); + work_pool_submit(&svc_work_pool, &sr_rec->ev_wpe); + + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: create evchan write control fd pair (%d:%d)", + __func__, + sr_rec->sv[0], sr_rec->sv[1]); + +out: + + if (code != 0) { + svc_rqst_release(sr_rec); + } + + rpc_dplx_rui(rec); + + return (code); } /* @@ -629,9 +1031,8 @@ svc_rqst_evchan_reg(uint32_t chan_id, SVCXPRT *xprt, uint32_t flags) { struct rpc_dplx_rec *rec = REC_XPRT(xprt); struct svc_rqst_rec *sr_rec; - struct svc_rqst_rec *ev_p; int code; - uint16_t bits = SVC_XPRT_FLAG_ADDED | (flags & SVC_XPRT_FLAG_UREG); + uint16_t bits = SVC_XPRT_FLAG_ADDED_RECV | (flags & SVC_XPRT_FLAG_UREG); if (chan_id == 0) { /* Create a global/legacy event channel */ @@ -648,6 +1049,7 @@ svc_rqst_evchan_reg(uint32_t chan_id, SVCXPRT *xprt, uint32_t flags) } sr_rec = svc_rqst_lookup_chan(chan_id); + if (!sr_rec) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p 
unknown evchan %d", @@ -658,9 +1060,8 @@ svc_rqst_evchan_reg(uint32_t chan_id, SVCXPRT *xprt, uint32_t flags) if (!(flags & RPC_DPLX_LOCKED)) rpc_dplx_rli(rec); - ev_p = (struct svc_rqst_rec *)rec->ev_p; - if (ev_p) { - if (ev_p == sr_rec) { + if (rec->ev_p) { + if (rec->ev_p == sr_rec) { if (!(flags & RPC_DPLX_LOCKED)) rpc_dplx_rui(rec); __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, @@ -668,17 +1069,17 @@ svc_rqst_evchan_reg(uint32_t chan_id, SVCXPRT *xprt, uint32_t flags) __func__, xprt, chan_id); return (0); } - svc_rqst_unreg(rec, ev_p); + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: %p unregistering evchan %d", + __func__, xprt, rec->ev_p->id_k); + svc_rqst_unreg(rec, rec->ev_p); } - /* assuming success */ - atomic_set_uint16_t_bits(&xprt->xp_flags, bits); - /* link from xprt */ rec->ev_p = sr_rec; - /* register on event channel */ - code = svc_rqst_hook_events(rec, sr_rec); + /* register sr_rec on event channel */ + code = svc_rqst_hook_events(rec, sr_rec, bits); if (!(flags & RPC_DPLX_LOCKED)) rpc_dplx_rui(rec); @@ -700,7 +1101,7 @@ svc_rqst_xprt_register(SVCXPRT *newxprt, SVCXPRT *xprt) newxprt, SVC_RQST_FLAG_CHAN_AFFINITY); - sr_rec = (struct svc_rqst_rec *) REC_XPRT(xprt)->ev_p; + sr_rec = REC_XPRT(xprt)->ev_p; /* or if parent xprt has no dedicated event channel */ if (!sr_rec) @@ -756,20 +1157,28 @@ svc_rqst_xprt_unregister(SVCXPRT *xprt, uint32_t flags) } /*static*/ void -svc_rqst_xprt_task(struct work_pool_entry *wpe) +svc_rqst_xprt_task_recv(struct work_pool_entry *wpe) { - struct rpc_dplx_rec *rec = - opr_containerof(wpe, struct rpc_dplx_rec, ioq.ioq_wpe); + struct xdr_ioq *ioq = + opr_containerof(wpe, struct xdr_ioq, ioq_wpe); + struct rpc_dplx_rec *rec = ioq->rec; - atomic_clear_uint16_t_bits(&rec->ioq.ioq_s.qflags, IOQ_FLAG_WORKING); + atomic_clear_uint16_t_bits(&ioq->ioq_s.qflags, IOQ_FLAG_WORKING); + +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv, __func__, __LINE__, + &rec->xprt, + (unsigned int)(rec->xprt.xp_flags & SVC_XPRT_FLAG_DESTROYED), + (unsigned 
int) rec->xprt.xp_refcnt); +#endif /* USE_LTTNG_NTIRPC */ /* atomic barrier (above) should protect following values */ if (rec->xprt.xp_refcnt > 1 - && !(rec->xprt.xp_flags & SVC_XPRT_FLAG_DESTROYED)) { + && !(rec->xprt.xp_flags & SVC_XPRT_FLAG_DESTROYED)) { /* (idempotent) xp_flags and xp_refcnt are set atomic. * xp_refcnt need more than 1 (this task). */ - (void)clock_gettime(CLOCK_MONOTONIC_FAST, &(rec->recv.ts)); + (void)clock_gettime(CLOCK_MONOTONIC_FAST, &rec->recv.ts); (void)SVC_RECV(&rec->xprt); } @@ -848,6 +1257,36 @@ void svc_resume(struct svc_req *req) work_pool_submit(&svc_work_pool, &(rpc_dplx_rec->ioq.ioq_wpe)); } +/*static*/ void +svc_rqst_xprt_task_send(struct work_pool_entry *wpe) +{ + struct xdr_ioq *ioq = + opr_containerof(wpe, struct xdr_ioq, ioq_wpe); + struct rpc_dplx_rec *rec = ioq->rec; + + atomic_clear_uint16_t_bits(&ioq->ioq_s.qflags, IOQ_FLAG_WORKING); + +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, send, __func__, __LINE__, + &rec->xprt, + (unsigned int)(rec->xprt.xp_flags & SVC_XPRT_FLAG_DESTROYED), + (unsigned int) rec->xprt.xp_refcnt); +#endif /* USE_LTTNG_NTIRPC */ + + /* atomic barrier (above) should protect following values */ + if (rec->xprt.xp_refcnt > 1 + && !(rec->xprt.xp_flags & SVC_XPRT_FLAG_DESTROYED)) { + /* (idempotent) xp_flags and xp_refcnt are set atomic. + * xp_refcnt need more than 1 (this task). + */ + svc_ioq_write(&rec->xprt); + } + + /* If tests fail, log non-fatal "WARNING! already destroying!" + */ + SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); +} + /* * Like __svc_clean_idle but event-type independent. For now no cleanfds. 
*/ @@ -917,11 +1356,13 @@ svc_rqst_clean_idle(int timeout) #ifdef TIRPC_EPOLL -static struct rpc_dplx_rec * +static struct xdr_ioq * svc_rqst_epoll_event(struct svc_rqst_rec *sr_rec, struct epoll_event *ev) { struct rpc_dplx_rec *rec = (struct rpc_dplx_rec *) ev->data.ptr; - uint16_t xp_flags; + uint16_t xp_flags, ev_flag = 0; + struct xdr_ioq *ioq = NULL; + work_pool_fun_t fun; if (unlikely(ev->data.fd == sr_rec->sv[1])) { /* signalled -- there was a wakeup on ctrl_ev (see @@ -944,29 +1385,62 @@ svc_rqst_epoll_event(struct svc_rqst_rec *sr_rec, struct epoll_event *ev) * without taking another one. */ + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: event %p %08x%s%s rpc_dplx_rec %p (sr_rec %p)", + __func__, ev, ev->events, + ev->events & EPOLLIN ? " RECV" : "", + ev->events & EPOLLOUT ? " SEND" : "", + rec, sr_rec); + + if (ev->events & EPOLLIN) { + /* This is a RECV event */ + ev_flag = SVC_XPRT_FLAG_ADDED_RECV; + ioq = &rec->ioq; + fun = svc_rqst_xprt_task_recv; + } else if (ev->events & EPOLLOUT) { + /* This is a SEND event */ + ev_flag = SVC_XPRT_FLAG_ADDED_SEND; + ioq = rec->ev_u.epoll.xioq_send; + fun = svc_rqst_xprt_task_send; + } else { + /* This is some other event... */ + SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); + return NULL; + } + /* MUST handle flags after reference. * Although another task may unhook, the error is non-fatal. */ - xp_flags = atomic_postclear_uint16_t_bits(&rec->xprt.xp_flags, - SVC_XPRT_FLAG_ADDED); + xp_flags = atomic_postclear_uint16_t_bits(&rec->xprt.xp_flags, ev_flag); __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | TIRPC_DEBUG_FLAG_REFCNT, "%s: %p fd %d xp_refcnt %" PRId32 - " event %d", + " event %08x xp_flags%s%s clear flag%s%s", __func__, rec, rec->xprt.xp_fd, rec->xprt.xp_refcnt, - ev->events); + ev->events, + xp_flags & SVC_XPRT_FLAG_ADDED_RECV ? " ADDED_RECV" : "", + xp_flags & SVC_XPRT_FLAG_ADDED_SEND ? " ADDED_SEND" : "", + ev_flag & SVC_XPRT_FLAG_ADDED_RECV ? " ADDED_RECV" : "", + ev_flag & SVC_XPRT_FLAG_ADDED_SEND ? 
" ADDED_SEND" : ""); + +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, event, __func__, __LINE__, &rec->xprt, xp_flags, + ev_flag); +#endif /* USE_LTTNG_NTIRPC */ if (rec->xprt.xp_refcnt > 1 - && (xp_flags & SVC_XPRT_FLAG_ADDED) - && !(xp_flags & SVC_XPRT_FLAG_DESTROYED) - && !(atomic_postset_uint16_t_bits(&rec->ioq.ioq_s.qflags, - IOQ_FLAG_WORKING) - & IOQ_FLAG_WORKING)) { + && (xp_flags & ev_flag) + && !(xp_flags & SVC_XPRT_FLAG_DESTROYED) + && !(atomic_postset_uint16_t_bits(&ioq->ioq_s.qflags, + IOQ_FLAG_WORKING) + & IOQ_FLAG_WORKING)) { /* (idempotent) xp_flags and xp_refcnt are set atomic. * xp_refcnt need more than 1 (this event). */ - return (rec); + ioq->ioq_wpe.fun = fun; + ioq->rec = rec; + return ioq; } /* Do not return destroyed transports. @@ -979,39 +1453,38 @@ svc_rqst_epoll_event(struct svc_rqst_rec *sr_rec, struct epoll_event *ev) /* * not locked */ -static inline struct rpc_dplx_rec * +static inline struct xdr_ioq * svc_rqst_epoll_events(struct svc_rqst_rec *sr_rec, int n_events) { - struct rpc_dplx_rec *rec = NULL; + struct xdr_ioq *ioq = NULL; int ix = 0; + /* Find the first RECV or SEND event */ while (ix < n_events) { - rec = svc_rqst_epoll_event(sr_rec, - &(sr_rec->ev_u.epoll.events[ix++])); - if (rec) + ioq = svc_rqst_epoll_event(sr_rec, + &sr_rec->ev_u.epoll.events[ix++]); + if (ioq) break; } - if (!rec) { + if (!ioq) { /* continue waiting for events with this task */ return NULL; } while (ix < n_events) { - struct rpc_dplx_rec *rec = svc_rqst_epoll_event(sr_rec, + /* Queue up additional RECV or SEND events */ + struct xdr_ioq *ioq = svc_rqst_epoll_event(sr_rec, &(sr_rec->ev_u.epoll.events[ix++])); - if (!rec) - continue; - - rec->ioq.ioq_wpe.fun = svc_rqst_xprt_task; - work_pool_submit(&svc_work_pool, &(rec->ioq.ioq_wpe)); + if (ioq) + work_pool_submit(&svc_work_pool, &ioq->ioq_wpe); } /* submit another task to handle events in order */ atomic_inc_int32_t(&sr_rec->ev_refcnt); work_pool_submit(&svc_work_pool, &sr_rec->ev_wpe); - return rec; + 
return ioq; } static void svc_rqst_epoll_loop(struct work_pool_entry *wpe) @@ -1077,15 +1550,22 @@ static void svc_rqst_epoll_loop(struct work_pool_entry *wpe) break; } if (n_events > 0) { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d n_events %d", + __func__, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd, n_events); + atomic_add_uint32_t(&wakeups, n_events); - struct rpc_dplx_rec *rec; + struct xdr_ioq *ioq; - rec = svc_rqst_epoll_events(sr_rec, n_events); + ioq = svc_rqst_epoll_events(sr_rec, n_events); - if (rec != NULL) { + if (ioq != NULL) { /* use this hot thread for the first event */ - rec->ioq.ioq_wpe.fun = svc_rqst_xprt_task; - svc_rqst_xprt_task(&(rec->ioq.ioq_wpe)); + ioq->ioq_wpe.fun(&ioq->ioq_wpe); /* failsafe idle processing after work task */ if (atomic_postclear_uint32_t_bits( @@ -1101,6 +1581,13 @@ static void svc_rqst_epoll_loop(struct work_pool_entry *wpe) } if (!n_events) { /* timed out (idle) */ + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d idle", + __func__, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd); atomic_inc_uint32_t(&wakeups); continue; } @@ -1116,6 +1603,14 @@ static void svc_rqst_epoll_loop(struct work_pool_entry *wpe) } } if (finished) { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | + TIRPC_DEBUG_FLAG_REFCNT, + "%s: sr_rec %p evchan %d ev_refcnt %" PRId32 + " epoll_fd %d finished", + __func__, + sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, + sr_rec->ev_u.epoll.epoll_fd); + close(sr_rec->ev_u.epoll.epoll_fd); mem_free(sr_rec->ev_u.epoll.events, sr_rec->ev_u.epoll.max_events * @@ -1152,11 +1647,12 @@ svc_rqst_thrd_signal(uint32_t chan_id, uint32_t flags) return (ENOENT); } + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: sv[0] fd %d before ev_sig (sr_rec %p) evchan %d", + __func__, sr_rec->sv[0], + sr_rec, chan_id); 
ev_sig(sr_rec->sv[0], flags); /* send wakeup */ - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: signalled evchan %d", - __func__, chan_id); svc_rqst_release(sr_rec); return (0); } @@ -1172,6 +1668,10 @@ svc_rqst_delete_evchan(uint32_t chan_id) return (ENOENT); } atomic_set_uint16_t_bits(&sr_rec->ev_flags, SVC_RQST_FLAG_SHUTDOWN); + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: sv[0] fd %d before ev_sig (sr_rec %p)", + __func__, sr_rec->sv[0], + sr_rec); ev_sig(sr_rec->sv[0], SVC_RQST_FLAG_SHUTDOWN); svc_rqst_release(sr_rec); diff --git a/src/svc_vc.c b/src/svc_vc.c index 2d8fbd29cc..dd2980bbc0 100644 --- a/src/svc_vc.c +++ b/src/svc_vc.c @@ -414,6 +414,10 @@ svc_vc_rendezvous(SVCXPRT *xprt) static int n = 1; struct timeval timeval; +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, funcin, __func__, __LINE__, xprt); +#endif /* USE_LTTNG_NTIRPC */ + again: len = sizeof(addr); fd = accept(xprt->xp_fd, (struct sockaddr *)(void *)&addr, &len); @@ -438,7 +442,7 @@ svc_vc_rendezvous(SVCXPRT *xprt) } return (XPRT_DIED); } - if (unlikely(svc_rqst_rearm_events(xprt))) { + if (unlikely(svc_rqst_rearm_events(xprt, SVC_XPRT_FLAG_ADDED_RECV))) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p fd %d svc_rqst_rearm_events failed (will set dead)", __func__, xprt, xprt->xp_fd); @@ -538,6 +542,9 @@ svc_vc_destroy_task(struct work_pool_entry *wpe) if ((xp_flags & SVC_XPRT_FLAG_CLOSE) && rec->xprt.xp_fd != RPC_ANYFD) { (void)close(rec->xprt.xp_fd); + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: fd %d closed", + __func__, rec->xprt.xp_fd); rec->xprt.xp_fd = RPC_ANYFD; } @@ -663,6 +670,10 @@ svc_vc_recv(SVCXPRT *xprt) u_int flags; int code; +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, funcin, __func__, __LINE__, xprt); +#endif /* USE_LTTNG_NTIRPC */ + /* no need for locking, only one svc_rqst_xprt_task() per event. * depends upon svc_rqst_rearm_events() for ordering. 
*/ @@ -687,19 +698,30 @@ svc_vc_recv(SVCXPRT *xprt) __warnx(TIRPC_DEBUG_FLAG_WARN, "%s: %p fd %d recv errno %d (try again)", "svc_vc_wait", xprt, xprt->xp_fd, code); - if (unlikely(svc_rqst_rearm_events(xprt))) { + if (unlikely(svc_rqst_rearm_events( + xprt, + SVC_XPRT_FLAG_ADDED_RECV))) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p fd %d svc_rqst_rearm_events failed (will set dead)", "svc_vc_wait", xprt, xprt->xp_fd); SVC_DESTROY(xprt); + code = EINVAL; } +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "EAGAIN", code); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } __warnx(TIRPC_DEBUG_FLAG_WARN, "%s: %p fd %d recv errno %d (will set dead)", "svc_vc_wait", xprt, xprt->xp_fd, code); SVC_DESTROY(xprt); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "ERROR", code); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } @@ -708,6 +730,10 @@ svc_vc_recv(SVCXPRT *xprt) "%s: %p fd %d recv closed (will set dead)", "svc_vc_wait", xprt, xprt->xp_fd); SVC_DESTROY(xprt); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "EMPTY", 0); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } @@ -724,9 +750,17 @@ svc_vc_recv(SVCXPRT *xprt) "%s: %p fd %d fragment is zero (will set dead)", __func__, xprt, xprt->xp_fd); SVC_DESTROY(xprt); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "NO RECORD", 0); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_frag, __func__, __LINE__, + xprt, xd->sx_fbtbc); +#endif /* USE_LTTNG_NTIRPC */ /* one buffer per fragment */ uv = xdr_ioq_uv_create(xd->sx_fbtbc, flags); (xioq->ioq_uv.uvqh.qcount)++; @@ -745,18 +779,29 @@ svc_vc_recv(SVCXPRT *xprt) __warnx(TIRPC_DEBUG_FLAG_SVC_VC, "%s: %p fd %d recv errno %d (try again)", __func__, xprt, xprt->xp_fd, code); - if (unlikely(svc_rqst_rearm_events(xprt))) { + if (unlikely(svc_rqst_rearm_events( + xprt, + 
SVC_XPRT_FLAG_ADDED_RECV))) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p fd %d svc_rqst_rearm_events failed (will set dead)", __func__, xprt, xprt->xp_fd); SVC_DESTROY(xprt); + code = EINVAL; } +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "EAGAIN", code); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p fd %d recv errno %d (will set dead)", __func__, xprt, xprt->xp_fd, code); SVC_DESTROY(xprt); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "ERROR", code); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } @@ -765,9 +810,18 @@ svc_vc_recv(SVCXPRT *xprt) "%s: %p fd %d recv closed (will set dead)", __func__, xprt, xprt->xp_fd); SVC_DESTROY(xprt); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "EMPTY", 0); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_bytes, __func__, __LINE__, + xprt, xd->sx_fbtbc, rlen); +#endif /* USE_LTTNG_NTIRPC */ + uv->v.vio_tail += rlen; xd->sx_fbtbc -= rlen; @@ -776,12 +830,22 @@ svc_vc_recv(SVCXPRT *xprt) __func__, xprt, xprt->xp_fd, rlen, xd->sx_fbtbc, flags); if (xd->sx_fbtbc || (flags & UIO_FLAG_MORE)) { - if (unlikely(svc_rqst_rearm_events(xprt))) { + if (unlikely(svc_rqst_rearm_events(xprt, + SVC_XPRT_FLAG_ADDED_RECV))) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p fd %d svc_rqst_rearm_events failed (will set dead)", __func__, xprt, xprt->xp_fd); SVC_DESTROY(xprt); +#ifndef USE_LTTNG_NTIRPC + } +#else + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "REARM FAILED", -1); + } else { + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "MORE", 0); } +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } @@ -790,15 +854,23 @@ svc_vc_recv(SVCXPRT *xprt) TAILQ_REMOVE(&rec->ioq.ioq_uv.uvqh.qh, &xioq->ioq_s, q); xdr_ioq_reset(xioq, 0); - if (unlikely(svc_rqst_rearm_events(xprt))) { + if 
(unlikely(svc_rqst_rearm_events(xprt, SVC_XPRT_FLAG_ADDED_RECV))) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: %p fd %d svc_rqst_rearm_events failed (will set dead)", __func__, xprt, xprt->xp_fd); xdr_ioq_destroy(xioq, xioq->ioq_s.qsize); SVC_DESTROY(xprt); +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "REARM FAILED", -1); +#endif /* USE_LTTNG_NTIRPC */ return SVC_STAT(xprt); } +#ifdef USE_LTTNG_NTIRPC + tracepoint(xprt, recv_exit, __func__, __LINE__, + xprt, "CALLING svc_request", 0); +#endif /* USE_LTTNG_NTIRPC */ return svc_request(xprt, xioq->xdrs); } diff --git a/src/svc_xprt.c b/src/svc_xprt.c index d8d935aed9..d0bc832121 100644 --- a/src/svc_xprt.c +++ b/src/svc_xprt.c @@ -172,6 +172,7 @@ svc_xprt_lookup(int fd, svc_xprt_setup_t setup) } (*setup)(&xprt); /* zalloc, xp_refcnt = 1 */ xprt->xp_fd = fd; + xprt->xp_fd_send = -1; xprt->xp_flags = SVC_XPRT_FLAG_INITIAL; /* Get ref for caller */ From ef6acb2f9291e4cd640de01e207a842bbaec638f Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Fri, 10 May 2019 15:47:07 -0700 Subject: [PATCH 35/70] Modify gss wrap and unwrap to work with iov functions gss_unwrap_iov still requires a single buffer, but at least it can decrypt in place. gss_mic_verify_iov requires the MIC token to be in a single buffer. If it is not, we extract it, othwewise we use it in place. The data may be in one or more buffers. If for some reason xdr_rpc_gss_wrap is called for PRIVACY on an xdr that does not support XDR_NEBUFS, it will use the old code which makes a copy of the data with a separate wrapbuf and then copies it back into the xdr stream. Since gss_get_mic_iov doesn't need to insert a header, it can work with streams that don't support XDR_NEWBUF since it will just append the MIC token at the end (and conceivably there might be a vector xdr that would put that in a new buffer even though it can't support XDR_NEWBUFS to insert a HEADER buffer). Signed-off-by: Frank S. 
Filz --- ntirpc/rpc/auth_gss.h | 4 +- src/auth_gss.c | 2 +- src/authgss_prot.c | 902 +++++++++++++++++++++++++++++++++++++----- src/svc_auth_gss.c | 97 +---- src/svc_vc.c | 10 +- 5 files changed, 815 insertions(+), 200 deletions(-) diff --git a/ntirpc/rpc/auth_gss.h b/ntirpc/rpc/auth_gss.h index 354f28b9f7..84bffaae3f 100644 --- a/ntirpc/rpc/auth_gss.h +++ b/ntirpc/rpc/auth_gss.h @@ -105,6 +105,8 @@ struct rpc_gss_init_res { /* Maximum sequence number value. */ #define RPCSEC_GSS_MAXSEQ 0x80000000 +typedef void (*checksum_func_t) (void *priv, void *databuf, size_t length); + /* Prototypes. */ __BEGIN_DECLS bool xdr_rpc_gss_cred(XDR *xdrs, struct rpc_gss_cred *p); @@ -115,7 +117,7 @@ bool xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, u_int seq); bool xdr_rpc_gss_unwrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, gss_ctx_id_t ctx, gss_qop_t qop, rpc_gss_svc_t svc, - u_int seq); + u_int seq, checksum_func_t checksum_func, void *priv); bool xdr_rpc_gss_encode(XDR *xdrs, gss_buffer_t buf, u_int maxsize); bool xdr_rpc_gss_decode(XDR *xdrs, gss_buffer_t buf); diff --git a/src/auth_gss.c b/src/auth_gss.c index 2d90d07eb5..654d1ed0cf 100644 --- a/src/auth_gss.c +++ b/src/auth_gss.c @@ -642,5 +642,5 @@ authgss_unwrap(AUTH *auth, XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr) return (xdr_rpc_gss_unwrap (xdrs, xdr_func, xdr_ptr, gd->ctx, gd->sec.qop, gd->sec.svc, - gd->gc.gc_seq)); + gd->gc.gc_seq, NULL, NULL)); } diff --git a/src/authgss_prot.c b/src/authgss_prot.c index eba71b6b1b..2786105510 100644 --- a/src/authgss_prot.c +++ b/src/authgss_prot.c @@ -45,10 +45,12 @@ #include #include #include +#include /* additional space needed for encoding */ #define RPC_SLACK_SPACE 1024 #define AUTHGSS_MAX_TOKEN_SIZE 24576 /* default MS PAC is 12000 bytes */ +#define MAXALLOCA (256) bool xdr_rpc_gss_encode(XDR *xdrs, gss_buffer_t buf, u_int maxsize) @@ -169,32 +171,430 @@ xdr_rpc_gss_init_res(XDR *xdrs, struct rpc_gss_init_res *p) return (xdr_stat); } +void 
+gss_log_error(char *m, OM_uint32 maj_stat, OM_uint32 min_stat) +{ + OM_uint32 min; + gss_buffer_desc msg1, msg2; + int msg_ctx = 0; + + gss_display_status(&min, maj_stat, GSS_C_GSS_CODE, GSS_C_NULL_OID, + &msg_ctx, &msg1); + + gss_display_status(&min, min_stat, GSS_C_MECH_CODE, GSS_C_NULL_OID, + &msg_ctx, &msg2); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "rpcsec_gss: %s: %s - %s\n", + m, (char *)msg1.value, (char *)msg2.value); + gss_release_buffer(&min, &msg1); + gss_release_buffer(&min, &msg2); +} + +void show_gss_xdr_iov(gss_iov_buffer_desc *gss_iov, int gss_count, + xdr_vio *xdr_iov, int xdr_count, const char *desc) +{ + int i; + + /* Now show the gss_iov */ + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "Show the gss_iov %s %p", desc, gss_iov); + + for (i = 0; i < gss_count; i++) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "buf %d type %d length %d value %p", + i, gss_iov[i].type, gss_iov[i].buffer.length, + gss_iov[i].buffer.value); + } + + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "Show the xdr_iov %s %p", desc, xdr_iov); + + if (xdr_iov == NULL) + return; + + for (i = 0; i < xdr_count; i++) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "buf %d type %d (base %p head %p tail %p wrap %p) length %lu", + i, xdr_iov[i].vio_type, + xdr_iov[i].vio_base, xdr_iov[i].vio_head, + xdr_iov[i].vio_tail, xdr_iov[i].vio_wrap, + (unsigned long)(xdr_iov[i].vio_tail - + xdr_iov[i].vio_head)); + } +} + bool xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, gss_ctx_id_t ctx, gss_qop_t qop, rpc_gss_svc_t svc, u_int seq) { gss_buffer_desc databuf, wrapbuf; OM_uint32 maj_stat, min_stat; - int start, end, conf_state; - bool xdr_stat; + int start, end, conf_state, iov_count, data_count, after_data, i; + bool xdr_stat, vector; u_int databuflen, maxwrapsz; + gss_iov_buffer_desc *gss_iov = NULL; + xdr_vio *xdr_iov = NULL, *data; + u_int32_t xvsize = 0, gvsize = 0; - /* Write dummy for databody length. 
*/ + if (svc != RPCSEC_GSS_SVC_PRIVACY && + svc != RPCSEC_GSS_SVC_INTEGRITY) { + /* For some reason we got here with not supported type. */ + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() svc != RPCSEC_GSS_SVC_PRIVACY or RPCSEC_GSS_SVC_INTEGRITY", + __func__); + return (FALSE); + } + + /* Write dummy for databody length. The length will be filled in later. + * - For RPCSEC_GSS_SVC_PRIVACY the length will include the whole + * result of gss_wrap. + * - For RPCSEC_GSS_SVC_INTEGRITY the length will just be the response + * data length. + * No matter what type or how we process, we will come back and fill + * the length in exactly here. + */ start = XDR_GETPOS(xdrs); databuflen = 0xaaaaaaaa; /* should always overwrite */ - if (!XDR_PUTUINT32(xdrs, databuflen)) + if (!XDR_PUTUINT32(xdrs, databuflen)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() could not put databuflen", + __func__); return (FALSE); + } - memset(&databuf, 0, sizeof(databuf)); - memset(&wrapbuf, 0, sizeof(wrapbuf)); - - /* Marshal rpc_gss_data_t (sequence number + arguments). */ - if (!XDR_PUTUINT32(xdrs, seq) || !(*xdr_func) (xdrs, xdr_ptr)) + /* If we are doing PRIVACY, determine if XDR is a vector or not. + * INTEGRITY can work with non-vector xdrs like xdrmem because the + * MIC token will just be appended at the end. + * If it's privacy, and NEWBUF is supported (because xdrs is a vector) + * then NEWBUF will have allocated the new buffer. + */ + vector = (svc == RPCSEC_GSS_SVC_INTEGRITY) || XDR_NEWBUF(xdrs); + + /* Marshal rpc_gss_data_t (sequence number + arguments). + * If it's a vector, the response has been marshalled into a new + * buffer so that we will be able to insert any header. 
+ */ + if (!XDR_PUTUINT32(xdrs, seq) || !(*xdr_func) (xdrs, xdr_ptr)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() could not enocde rpc_gss_data_t", + __func__); return (FALSE); + } + end = XDR_GETPOS(xdrs); + databuflen = end - start - 4; + + if (vector) { + /* Now we have the response encoded, time to build out iov for + * gss_get_mic_iov or gss_wrap_iov. + * + * vsize = ioq count + 2 (for header and trailer) + */ + data_count = XDR_IOVCOUNT(xdrs, start + 4, databuflen); + + if (data_count < 0) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() data_count = %d", + __func__, data_count); + return (FALSE); + } + + if (svc == RPCSEC_GSS_SVC_INTEGRITY) { + /* Add a trailer buffer for the MIC */ + iov_count = data_count + 1; + after_data = data_count; + } else if (svc == RPCSEC_GSS_SVC_PRIVACY) { + /* Add header, padding, and trailer for the wrap */ + iov_count = data_count + 3; + after_data = data_count + 1; + } + + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "data_count=%d, iov_count=%d, after_data=%d", + data_count, iov_count, after_data); + + /* Determine the size of the gss_iov */ + gvsize = iov_count * sizeof(gss_iov_buffer_desc); + xvsize = iov_count * sizeof(xdr_vio); + + /* Allocate the gss_iov */ + if (unlikely(gvsize > MAXALLOCA)) { + gss_iov = mem_alloc(gvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "mem_alloc gss_iov=%p size %llu count %d", + gss_iov, (unsigned long long) gvsize, + iov_count); + } else { + gss_iov = alloca(gvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "alloca gss_iov=%p size %llu count %d", + gss_iov, (unsigned long long) gvsize, + iov_count); + } + + /* Allocate the xdr_iov */ + if (unlikely(xvsize > MAXALLOCA)) { + xdr_iov = mem_alloc(xvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "mem_alloc xdr_iov=%p size %llu count %d", + xdr_iov, (unsigned long long) xvsize, + iov_count); + } else { + xdr_iov = alloca(xvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "alloca xdr_iov=%p size %llu count %d", + xdr_iov, (unsigned long long) 
xvsize, + iov_count); + } + + memset(gss_iov, 0, gvsize); + memset(xdr_iov, 0, xvsize); + + /* Point to where the first buffer in the data will be. */ + data = &xdr_iov[(svc == RPCSEC_GSS_SVC_PRIVACY) ? 1 : 0]; + + /* Now fill in the data buffers + * vector is empty on entry + * DATA buffers are completely filled (vio_base, vio_head, + * vio_tail, vio_wrap, vio_length, and vio_type) on exit. + * No other buffers are touched at this point. + */ + xdr_stat = XDR_FILLBUFS(xdrs, start + 4, data, databuflen); + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + "after XDR_FILLBUFS"); + + /* Now set up the gss_iov */ + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "Set up gss_iov"); + for (i = 0; i < iov_count; i++) { + if (i == 0 && svc == RPCSEC_GSS_SVC_PRIVACY) { + /* Fill in HEADER buffer */ + gss_iov[i].type = GSS_IOV_BUFFER_TYPE_HEADER; + } else if (i < after_data) { + /* Copy over a DATA buffer */ + gss_iov[i].type = GSS_IOV_BUFFER_TYPE_DATA; + gss_iov[i].buffer.length = + xdr_iov[i].vio_length; + gss_iov[i].buffer.value = + xdr_iov[i].vio_head; + } else if (svc == RPCSEC_GSS_SVC_INTEGRITY) { + /* Set up TRAILER buffer for INTEGRITY*/ + gss_iov[i].type = GSS_IOV_BUFFER_TYPE_MIC_TOKEN; + } else if (i == after_data) { + /* Set up PADDING buffer for PRIVACY*/ + gss_iov[i].type = GSS_IOV_BUFFER_TYPE_PADDING; + } else { + /* Set up TRAILER buffer for PRIVACY*/ + gss_iov[i].type = GSS_IOV_BUFFER_TYPE_TRAILER; + } + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "buf %d type %d length %d value %p", + i, gss_iov[i].type, gss_iov[i].buffer.length, + gss_iov[i].buffer.value); + } + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + "after setting up gss_iov"); + + /* At this point gss_iov HEADER, PADDING, and TRAILER have + * type set and buffer is empty. + * DATA is completely filled in. + * xdr_iov DATA buffers are completely filled in. + * xdr_iov HEADER and TRAILER buffers are empty. 
+ */ + + if (svc == RPCSEC_GSS_SVC_INTEGRITY) { + /* Now call gss_get_mic_iov_length */ + maj_stat = gss_get_mic_iov_length(&min_stat, ctx, qop, + gss_iov, iov_count); + + if (maj_stat != GSS_S_COMPLETE) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() gss_get_mic_iov_length failed", + __func__); + gss_log_error("gss_get_mic_iov_length", + maj_stat, min_stat); + xdr_stat = FALSE; + goto out; + } + + /* Copy the TRAILER buffer length into the xdr_iov */ + xdr_iov[after_data].vio_length = + gss_iov[after_data].buffer.length; + xdr_iov[after_data].vio_type = VIO_TRAILER_LEN; + + /* Marshal databody_integ length. Note tha this will + * leave the cursor position at start + 4 but the + * forthcoming XDR_ALLOCHDRS is going to fix the + * cursor position to the end of everything. + */ + if (!XDR_SETPOS(xdrs, start)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_SETPOS #2 failed", + __func__); + return (FALSE); + } + + if (!XDR_PUTUINT32(xdrs, databuflen)) + return (FALSE); + } else { + u_int databody_priv_len; + + /* Now call gss_wrap_iov_length */ + maj_stat = gss_wrap_iov_length(&min_stat, ctx, true, + qop, GSS_C_QOP_DEFAULT, + gss_iov, iov_count); + + if (maj_stat != GSS_S_COMPLETE) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() gss_wrap_iov_length failed", + __func__); + gss_log_error("gss_wrap_iov_length", + maj_stat, min_stat); + xdr_stat = FALSE; + goto out; + } + + /* Copy the HEADER buffer length into the xdr_iov */ + xdr_iov[0].vio_length = gss_iov[0].buffer.length; + xdr_iov[0].vio_type = VIO_HEADER; + + /* Copy the PADDING buffer length into the xdr_iov */ + xdr_iov[after_data].vio_length = + gss_iov[after_data].buffer.length; + xdr_iov[after_data].vio_type = VIO_TRAILER; + + /* Copy the TRAILER buffer length into the xdr_iov */ + xdr_iov[after_data + 1].vio_length = + gss_iov[after_data + 1].buffer.length; + xdr_iov[after_data + 1].vio_type = VIO_TRAILER; + + /* Compute the databody_priv length as sum of + * the databuflen and the HEADER, PADDING, and 
+ * TRAILER buffers. + */ + databody_priv_len = databuflen + + gss_iov[0].buffer.length + + gss_iov[after_data].buffer.length + + gss_iov[after_data + 1].buffer.length; + + /* Marshal databody_priv length. Note tha this will + * leave the cursor position at start + 4 but the + * forthcoming XDR_ALLOCHDRS is going to fix the + * cursor position to the end of everything. + */ + if (!XDR_SETPOS(xdrs, start)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_SETPOS #2 failed", + __func__); + return (FALSE); + } + + if (!XDR_PUTUINT32(xdrs, databody_priv_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_PUTUINT32 databody_priv_len failed", + __func__); + return (FALSE); + } + } + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + "after gss_...length"); + + /* At this point: + * The xdr_iov DATA buffers are completely filled in. + * The xdr_iov HEADER and TRAILER buffers have type and length + * filled in. + */ + + /* Now actually allocate the HEADER, PADDING, and TRAILER. + * The cursor position will be updated to the end of the + * TRAILER. + */ + xdr_stat = XDR_ALLOCHDRS(xdrs, start + 4, xdr_iov, iov_count); + + if (!xdr_stat) + goto out; + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + "after XDR_ALLOCHDRS"); + + /* At this point the xdr_iov is completely filled in. 
*/ + + if (svc == RPCSEC_GSS_SVC_INTEGRITY) { + /* Copy the TRAILER buffer into the gss_iov */ + gss_iov[after_data].buffer.value = + xdr_iov[after_data].vio_head; + + /* At this point the gss_iov is completely filled in */ + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + "just before gss_get_mic_iov"); + + /* Now call gss_get_mic_iov */ + maj_stat = gss_get_mic_iov(&min_stat, ctx, qop, + gss_iov, iov_count); + + if (maj_stat != GSS_S_COMPLETE) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() gss_get_mic_iov failed", + __func__); + gss_log_error("gss_get_mic_iov", + maj_stat, min_stat); + xdr_stat = FALSE; + goto out; + } + } else { + /* Copy the HEADER buffer into the gss_iov */ + gss_iov[0].buffer.value = xdr_iov[0].vio_head; + + /* Copy the PADDING buffer into the gss_iov */ + gss_iov[after_data].buffer.value = + xdr_iov[after_data].vio_head; + + /* Copy the TRAILER buffer into the gss_iov */ + gss_iov[after_data + 1].buffer.value = + xdr_iov[after_data + 1].vio_head; + + /* At this point the gss_iov is completely filled in */ + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + "just before gss_wrap_iov"); + + /* Now call gss_wrap_iov */ + maj_stat = gss_wrap_iov(&min_stat, ctx, true, + GSS_C_QOP_DEFAULT, NULL, + gss_iov, iov_count); + + if (maj_stat != GSS_S_COMPLETE) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() gss_wrap_iov failed", + __func__); + gss_log_error("gss_wrap_iov", + maj_stat, min_stat); + xdr_stat = FALSE; + goto out; + } + } + + /* At this point, the xdr_iov now has all the GSS data in it + * and wrapping is complete. Now we need to go back and write + * the length back at start. + */ + + goto out; + } /* else fall through to legacy single buffer implementation */ + + /* Initialize the static buffers */ + memset(&databuf, 0, sizeof(databuf)); + memset(&wrapbuf, 0, sizeof(wrapbuf)); /* Set databuf to marshalled rpc_gss_data_t. 
*/ - databuflen = end - start - 4; if (!XDR_SETPOS(xdrs, start+4)) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "%s() XDR_SETPOS #1 failed", @@ -211,140 +611,415 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, return (FALSE); } - xdr_stat = FALSE; + /* We only need the legacy inplementation for RPCSEC_GSS_SVC_PRIVACY */ - if (svc == RPCSEC_GSS_SVC_INTEGRITY) { - /* Marshal databody_integ length. */ - if (!XDR_SETPOS(xdrs, start)) { - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() XDR_SETPOS #2 failed", - __func__); - return (FALSE); - } - if (!XDR_PUTUINT32(xdrs, databuflen)) - return (FALSE); + /* Encrypt rpc_gss_data_t. */ + maj_stat = gss_wrap(&min_stat, ctx, TRUE, qop, &databuf, &conf_state, + &wrapbuf); - /* Checksum rpc_gss_data_t. */ - maj_stat = gss_get_mic(&min_stat, ctx, qop, &databuf, &wrapbuf); - if (maj_stat != GSS_S_COMPLETE) { - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() gss_get_mic failed", - __func__); - return (FALSE); - } - /* Marshal checksum. */ - if (!XDR_SETPOS(xdrs, end)) { - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() XDR_SETPOS #3 failed", - __func__); - gss_release_buffer(&min_stat, &wrapbuf); - return (FALSE); - } - maxwrapsz = (u_int) (wrapbuf.length + RPC_SLACK_SPACE); - xdr_stat = xdr_rpc_gss_encode(xdrs, &wrapbuf, maxwrapsz); - gss_release_buffer(&min_stat, &wrapbuf); - } else if (svc == RPCSEC_GSS_SVC_PRIVACY) { - /* Encrypt rpc_gss_data_t. */ - maj_stat = - gss_wrap(&min_stat, ctx, TRUE, qop, &databuf, &conf_state, - &wrapbuf); - if (maj_stat != GSS_S_COMPLETE) { - gss_log_status("gss_wrap", maj_stat, min_stat); - return (FALSE); - } - /* Marshal databody_priv. 
*/ - if (!XDR_SETPOS(xdrs, start)) { - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() XDR_SETPOS #4 failed", - __func__); - gss_release_buffer(&min_stat, &wrapbuf); - return (FALSE); - } - maxwrapsz = (u_int) (wrapbuf.length + RPC_SLACK_SPACE); - xdr_stat = xdr_rpc_gss_encode(xdrs, &wrapbuf, maxwrapsz); + if (maj_stat != GSS_S_COMPLETE) { + gss_log_status("gss_wrap", maj_stat, min_stat); + return (FALSE); + } + + /* Marshal databody_priv. */ + if (!XDR_SETPOS(xdrs, start)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_SETPOS #4 failed", + __func__); gss_release_buffer(&min_stat, &wrapbuf); + return (FALSE); } + + maxwrapsz = (u_int) (wrapbuf.length + RPC_SLACK_SPACE); + xdr_stat = xdr_rpc_gss_encode(xdrs, &wrapbuf, maxwrapsz); + gss_release_buffer(&min_stat, &wrapbuf); + if (!xdr_stat) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "%s() failed", __func__); } + +out: + + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "check free gss_iov=%p size %llu", + gss_iov, (unsigned long long) gvsize); + + if (unlikely(gvsize > MAXALLOCA)) { + mem_free(gss_iov, gvsize); + } + + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "check free gss_iov=%p size %llu", + xdr_iov, (unsigned long long) xvsize); + + if (unlikely(xvsize > MAXALLOCA)) { + mem_free(xdr_iov, xvsize); + } + return (xdr_stat); } bool xdr_rpc_gss_unwrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, gss_ctx_id_t ctx, gss_qop_t qop, rpc_gss_svc_t svc, - u_int seq) + u_int seq, checksum_func_t checksum_func, void *priv) { - XDR tmpxdrs; - gss_buffer_desc databuf, wrapbuf; + XDR tmpxdrs, *usexdrs = xdrs; OM_uint32 maj_stat, min_stat; - u_int qop_state; - int conf_state; - uint32_t seq_num; + u_int qop_state, data_start, token_start, buffer_len = 0; + int conf_state, iov_count, token_iov_count, i; + uint32_t seq_num, xvsize = 0, gvsize = 0, data_len, token_len; bool xdr_stat; + gss_iov_buffer_desc *gss_iov = NULL; + xdr_vio *xdr_iov = NULL; + char *buffer = NULL; + + if (svc != RPCSEC_GSS_SVC_PRIVACY && + svc != 
RPCSEC_GSS_SVC_INTEGRITY) { + /* For some reason we got here with not supported type. */ + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() svc != RPCSEC_GSS_SVC_PRIVACY or RPCSEC_GSS_SVC_INTEGRITY", + __func__); + return (FALSE); + } if (xdr_func == (xdrproc_t) xdr_void || xdr_ptr == NULL) return (TRUE); - memset(&databuf, 0, sizeof(databuf)); - memset(&wrapbuf, 0, sizeof(wrapbuf)); - if (svc == RPCSEC_GSS_SVC_INTEGRITY) { - /* Decode databody_integ. */ - if (!xdr_rpc_gss_decode(xdrs, &databuf)) { + /* + * first deal with the data length since xdr bytes are counted + */ + if (!XDR_GETUINT32(xdrs, &data_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s:%u ERROR size", + __func__, __LINE__); + return (FALSE); + } + data_start = XDR_GETPOS(xdrs); + iov_count = XDR_IOVCOUNT(xdrs, data_start, data_len); + if (iov_count < 0) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() xdr_rpc_gss_decode databody_integ failed", + "%s() XDR_IOVCOUNT signed data failed", __func__); return (FALSE); } - /* Decode checksum. */ - if (!xdr_rpc_gss_decode(xdrs, &wrapbuf)) { - gss_release_buffer(&min_stat, &databuf); + if (!XDR_SETPOS(xdrs, data_start + data_len)) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() xdr_rpc_gss_decode checksum failed", + "%s() XDR_SETPOS failed", __func__); return (FALSE); } - /* Verify checksum and QOP. */ - maj_stat = - gss_verify_mic(&min_stat, ctx, &databuf, &wrapbuf, - &qop_state); - gss_release_buffer(&min_stat, &wrapbuf); + if (!XDR_GETUINT32(xdrs, &token_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s:%u ERROR size", + __func__, __LINE__); + return (FALSE); + } + token_start = XDR_GETPOS(xdrs); + token_iov_count = XDR_IOVCOUNT(xdrs, token_start, token_len); + if (token_iov_count < 0) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_IOVCOUNT MIC token failed", + __func__); + return (FALSE); + } + + /* Determine the size of the gss_iov and xdr_iov. 
*/ + gvsize = (iov_count + 1) * sizeof(gss_iov_buffer_desc); + xvsize = (iov_count + 1) * sizeof(xdr_vio); + + /* Allocate the gss_iov */ + if (unlikely(gvsize > MAXALLOCA)) { + gss_iov = mem_alloc(gvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "mem_alloc gss_iov=%p size %llu count %d", + gss_iov, (unsigned long long) gvsize, + iov_count); + } else { + gss_iov = alloca(gvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "alloca gss_iov=%p size %llu count %d", + gss_iov, (unsigned long long) gvsize, + iov_count); + } + + /* Allocate the xdr_iov */ + if (unlikely(xvsize > MAXALLOCA)) { + xdr_iov = mem_alloc(xvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "mem_alloc xdr_iov=%p size %llu count %d", + xdr_iov, (unsigned long long) xvsize, + iov_count); + } else { + xdr_iov = alloca(xvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "alloca xdr_iov=%p size %llu count %d", + xdr_iov, (unsigned long long) xvsize, + iov_count); + } + + memset(gss_iov, 0, gvsize); + memset(xdr_iov, 0, xvsize); + + if (token_iov_count != 1) { + /* We need to allocate a token buffer */ + buffer = mem_alloc(token_len); + buffer_len = token_len; + gss_iov[iov_count].type = GSS_IOV_BUFFER_TYPE_MIC_TOKEN; + gss_iov[iov_count].buffer.length = token_len; + gss_iov[iov_count].buffer.value = buffer; + + /* Now extract the MIC token into the buffer */ + if (!xdr_opaque_decode(xdrs, buffer, token_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() xdr_opaque_decode MIC failed", + __func__); + xdr_stat = FALSE; + goto out; + } + } else { + if (!XDR_FILLBUFS(xdrs, token_start, + &xdr_iov[iov_count], token_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_FILLBUFS MIC token failed", + __func__); + xdr_stat = FALSE; + goto out; + } + gss_iov[iov_count].type = GSS_IOV_BUFFER_TYPE_MIC_TOKEN; + gss_iov[iov_count].buffer.length = token_len; + gss_iov[iov_count].buffer.value = + xdr_iov[iov_count].vio_head; + /* Consume the MIC token */ + if (!XDR_SETPOS(xdrs, token_start + token_len)) { + 
__warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_SETPOS failed %lu %lu %lu", + __func__, + (unsigned long) token_start, + (unsigned long) token_len, + (unsigned long) token_start + + token_len); + xdr_stat = FALSE; + goto out; + } + } + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count + 1, xdr_iov, iov_count + 1, + "just before XDR_FILLBUFS for data buffers"); + + /* Now fill in the data buffers + * DATA buffers are completely filled (vio_base, vio_head, + * vio_tail, vio_wrap, vio_length, and vio_type) on exit. + */ + if (!XDR_FILLBUFS(xdrs, data_start, &xdr_iov[0], data_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_FILLBUFS integrity data failed", + __func__); + xdr_stat = FALSE; + goto out; + } + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count + 1, xdr_iov, iov_count + 1, + "just after XDR_FILLBUFS for data buffers"); + + /* Now set up the gss_iov */ + for (i = 0; i < iov_count; i++) { + /* Copy over a DATA buffer */ + gss_iov[i].type = GSS_IOV_BUFFER_TYPE_DATA; + gss_iov[i].buffer.length = xdr_iov[i].vio_length; + gss_iov[i].buffer.value = xdr_iov[i].vio_head; + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "buf %d type %d length %d value %p", + i, gss_iov[i].type, gss_iov[i].buffer.length, + gss_iov[i].buffer.value); + } + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, iov_count + 1, xdr_iov, iov_count + 1, + "just before gss_verify_mic_iov"); + + maj_stat = gss_verify_mic_iov(&min_stat, ctx, &qop_state, + gss_iov, iov_count + 1); if (maj_stat != GSS_S_COMPLETE || qop_state != qop) { - gss_release_buffer(&min_stat, &databuf); - gss_log_status("gss_verify_mic", maj_stat, min_stat); - return (FALSE); + gss_log_error("gss_verify_mic_iov", + maj_stat, min_stat); + xdr_stat = FALSE; + goto out; } - } else if (svc == RPCSEC_GSS_SVC_PRIVACY) { - /* Decode databody_priv. */ - if (!xdr_rpc_gss_decode(xdrs, &wrapbuf)) { + + /* Now we have verified. 
The data is still in place so we can + * decode the actual request from the original xdrs, so position + * to data_start so decode can begin. + */ + if (!XDR_SETPOS(xdrs, data_start)) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() xdr_rpc_gss_decode databody_priv failed", + "%s() XDR_SETPOS to veriefied data start failed", __func__); + xdr_stat = FALSE; + goto out; + } + } else if (svc == RPCSEC_GSS_SVC_PRIVACY) { + /* + * first deal with the token length since xdr bytes are counted + * token_start and token_len refer to the entire wrapped package + */ + if (!XDR_GETUINT32(xdrs, &token_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s:%u ERROR size", + __func__, __LINE__); return (FALSE); } - /* Decrypt databody. */ - maj_stat = - gss_unwrap(&min_stat, ctx, &wrapbuf, &databuf, &conf_state, - &qop_state); - gss_release_buffer(&min_stat, &wrapbuf); + token_start = XDR_GETPOS(xdrs); + iov_count = XDR_IOVCOUNT(xdrs, token_start, token_len); - /* Verify encryption and QOP. */ - if (maj_stat != GSS_S_COMPLETE || qop_state != qop - || conf_state != TRUE) { - gss_release_buffer(&min_stat, &databuf); - gss_log_status("gss_unwrap", maj_stat, min_stat); + if (iov_count < 0) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_IOVCOUNT privacy data failed", + __func__); return (FALSE); } + + /* Determine the size of the gss_iov and xdr_iov. + * NOTE: we only need a single xdr_iov buffer. 
+ * The gss_iov will always be 2: STREAM, DATA + */ + gvsize = 2 * sizeof(gss_iov_buffer_desc); + xvsize = sizeof(xdr_vio); + + /* Allocate the gss_iov */ + if (unlikely(gvsize > MAXALLOCA)) { + gss_iov = mem_alloc(gvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "mem_alloc gss_iov=%p size %llu count %d", + gss_iov, (unsigned long long) gvsize, + iov_count); + } else { + gss_iov = alloca(gvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "alloca gss_iov=%p size %llu count %d", + gss_iov, (unsigned long long) gvsize, + iov_count); + } + + /* Allocate the xdr_iov */ + if (unlikely(xvsize > MAXALLOCA)) { + xdr_iov = mem_alloc(xvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "mem_alloc xdr_iov=%p size %llu count %d", + xdr_iov, (unsigned long long) xvsize, + iov_count); + } else { + xdr_iov = alloca(xvsize); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "alloca xdr_iov=%p size %llu count %d", + xdr_iov, (unsigned long long) xvsize, + iov_count); + } + + memset(gss_iov, 0, gvsize); + memset(xdr_iov, 0, xvsize); + + gss_iov[0].type = GSS_IOV_BUFFER_TYPE_STREAM; + gss_iov[1].type = GSS_IOV_BUFFER_TYPE_DATA; + + if (iov_count == 1) { + /* We can unwrap in place */ + if (!XDR_FILLBUFS(xdrs, token_start, + &xdr_iov[0], token_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_FILLBUFS wrap token failed", + __func__); + xdr_stat = FALSE; + goto out; + } + + gss_iov[0].buffer.length = xdr_iov[0].vio_length; + gss_iov[0].buffer.value = xdr_iov[0].vio_head; + } else { + /* We need to extract into a single buffer to unwrap */ + buffer = mem_alloc(token_len); + buffer_len = token_len; + gss_iov[0].buffer.length = token_len; + gss_iov[0].buffer.value = buffer; + + /* Now extract the wrap token into the buffer */ + if (!xdr_opaque_decode(xdrs, buffer, token_len)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() xdr_opaque_decode wrap token failed", + __func__); + xdr_stat = FALSE; + goto out; + } + } + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, 2, 
xdr_iov, 1, + "just before gss_unwrap_iov"); + + /* Now we have the wrap token in the STREAM buffer */ + maj_stat = gss_unwrap_iov(&min_stat, ctx, &conf_state, + &qop_state, gss_iov, 2); + + if (maj_stat != GSS_S_COMPLETE || qop_state != qop) { + gss_log_error("gss_unwrap_iov", maj_stat, min_stat); + xdr_stat = FALSE; + goto out; + } + + /* Now show the gss_iov and xdr_iov */ + show_gss_xdr_iov(gss_iov, 2, xdr_iov, 1, + "just after gss_unwrap_iov"); + + if (iov_count == 1) { + /* We can decode in place, find the data_start by + * determining the offset within the STREAM + * that gss_unwrap_iov indicated via the DATA buffer + * pointer. + */ + data_start = token_start + + gss_iov[1].buffer.value - + gss_iov[0].buffer.value; + data_len = gss_iov[1].buffer.length; + + if (!XDR_SETPOS(xdrs, data_start)) { + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "%s() XDR_SETPOS to veriefied data start failed", + __func__); + xdr_stat = FALSE; + goto out; + } + } else { + /* We need to create an xdrmem from the DATA buffer */ + xdrmem_create(&tmpxdrs, gss_iov[1].buffer.value, + gss_iov[1].buffer.length, XDR_DECODE); + usexdrs = &tmpxdrs; + } } + + /* At this point, usexdrs has been set either to the original xdrs + * up front, or due to the need to unwrap a multi-buffer token, has + * been set to &tmpxdrs. + */ + + /* If checksum is requested perform it. */ + if (checksum_func != NULL) { + checksum_func(priv, usexdrs->x_data, xdr_size_inline(usexdrs)); + } + /* Decode rpc_gss_data_t (sequence number + arguments). */ - xdrmem_create(&tmpxdrs, databuf.value, databuf.length, XDR_DECODE); - xdr_stat = (XDR_GETUINT32(&tmpxdrs, &seq_num) - && (*xdr_func) (&tmpxdrs, xdr_ptr)); - XDR_DESTROY(&tmpxdrs); - gss_release_buffer(&min_stat, &databuf); + xdr_stat = (XDR_GETUINT32(usexdrs, &seq_num) + && (*xdr_func) (usexdrs, xdr_ptr)); + + if (usexdrs == &tmpxdrs) { + /* If it's the tmpxdrs, then destroy the xdrmem we created. */ + XDR_DESTROY(&tmpxdrs); + } /* Verify sequence number. 
*/ if (xdr_stat == TRUE && seq_num != seq) { @@ -353,6 +1028,29 @@ xdr_rpc_gss_unwrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, __func__); return (FALSE); } + +out: + + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "check free gss_iov=%p size %llu", + gss_iov, (unsigned long long) gvsize); + + if (unlikely(gvsize > MAXALLOCA)) { + mem_free(gss_iov, gvsize); + } + + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, + "check free gss_iov=%p size %llu", + xdr_iov, (unsigned long long) xvsize); + + if (unlikely(xvsize > MAXALLOCA)) { + mem_free(xdr_iov, xvsize); + } + + if (buffer != NULL) { + mem_free(buffer, buffer_len); + } + return (xdr_stat); } diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index 3fc221bbb5..ae9052c70a 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -740,98 +740,17 @@ svcauth_gss_unwrap(struct svc_req *req) mutex_lock(&gd->lock); result = xdr_rpc_gss_unwrap(req->rq_xdrs, req->rq_msg.rm_xdr.proc, req->rq_msg.rm_xdr.where, gd->ctx, - gd->sec.qop, gd->sec.svc, gc_seq); + gd->sec.qop, gd->sec.svc, gc_seq, + NULL, NULL); mutex_unlock(&gd->lock); return (result); } -static inline bool -xdr_rpc_gss_checksum(struct svc_req *req, gss_ctx_id_t ctx, gss_qop_t qop, - rpc_gss_svc_t svc, u_int seq) +void svcauth_gss_svc_checksum(void *priv, void *databuf, size_t length) { - XDR *xdrs = req->rq_xdrs; - XDR tmpxdrs; - gss_buffer_desc databuf, wrapbuf; - OM_uint32 maj_stat, min_stat; - u_int qop_state; - int conf_state; - uint32_t seq_num; - bool xdr_stat; - - if (req->rq_msg.rm_xdr.proc == (xdrproc_t) xdr_void - || req->rq_msg.rm_xdr.where == NULL) - return (TRUE); - - memset(&databuf, 0, sizeof(databuf)); - memset(&wrapbuf, 0, sizeof(wrapbuf)); - - if (svc == RPCSEC_GSS_SVC_INTEGRITY) { - /* Decode databody_integ. */ - if (!xdr_rpc_gss_decode(xdrs, &databuf)) { - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() xdr_rpc_gss_decode databody_integ failed", - __func__); - return (FALSE); - } - /* Decode checksum. 
*/ - if (!xdr_rpc_gss_decode(xdrs, &wrapbuf)) { - gss_release_buffer(&min_stat, &databuf); - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() xdr_rpc_gss_decode checksum failed", - __func__); - return (FALSE); - } - /* Verify checksum and QOP. */ - maj_stat = - gss_verify_mic(&min_stat, ctx, &databuf, &wrapbuf, - &qop_state); - gss_release_buffer(&min_stat, &wrapbuf); - - if (maj_stat != GSS_S_COMPLETE || qop_state != qop) { - gss_release_buffer(&min_stat, &databuf); - gss_log_status("gss_verify_mic", maj_stat, min_stat); - return (FALSE); - } - } else if (svc == RPCSEC_GSS_SVC_PRIVACY) { - /* Decode databody_priv. */ - if (!xdr_rpc_gss_decode(xdrs, &wrapbuf)) { - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() xdr_rpc_gss_decode databody_priv failed", - __func__); - return (FALSE); - } - /* Decrypt databody. */ - maj_stat = - gss_unwrap(&min_stat, ctx, &wrapbuf, &databuf, &conf_state, - &qop_state); + struct svc_req *req = priv; - gss_release_buffer(&min_stat, &wrapbuf); - - /* Verify encryption and QOP. */ - if (maj_stat != GSS_S_COMPLETE || qop_state != qop - || conf_state != TRUE) { - gss_release_buffer(&min_stat, &databuf); - gss_log_status("gss_unwrap", maj_stat, min_stat); - return (FALSE); - } - } - /* Decode rpc_gss_data_t (sequence number + arguments). */ - xdrmem_create(&tmpxdrs, databuf.value, databuf.length, XDR_DECODE); - SVC_CHECKSUM(req, databuf.value, databuf.length); - xdr_stat = (XDR_GETUINT32(&tmpxdrs, &seq_num) - && (*req->rq_msg.rm_xdr.proc) - (&tmpxdrs, req->rq_msg.rm_xdr.where)); - XDR_DESTROY(&tmpxdrs); - gss_release_buffer(&min_stat, &databuf); - - /* Verify sequence number. 
*/ - if (xdr_stat == TRUE && seq_num != seq) { - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "%s() wrong sequence number in databody", - __func__); - return (FALSE); - } - return (xdr_stat); + SVC_CHECKSUM(req, databuf, length); } static bool @@ -846,8 +765,10 @@ svcauth_gss_checksum(struct svc_req *req) } mutex_lock(&gd->lock); - result = xdr_rpc_gss_checksum(req, gd->ctx, gd->sec.qop, gd->sec.svc, - gc_seq); + result = xdr_rpc_gss_unwrap(req->rq_xdrs, req->rq_msg.rm_xdr.proc, + req->rq_msg.rm_xdr.where, gd->ctx, + gd->sec.qop, gd->sec.svc, gc_seq, + svcauth_gss_svc_checksum, req); mutex_unlock(&gd->lock); return (result); } diff --git a/src/svc_vc.c b/src/svc_vc.c index dd2980bbc0..38cc99f1bd 100644 --- a/src/svc_vc.c +++ b/src/svc_vc.c @@ -934,18 +934,12 @@ svc_vc_reply(struct svc_req *req) SVCXPRT *xprt = req->rq_xprt; struct xdr_ioq *xioq; - /* XXX Until gss_get_mic and gss_wrap can be replaced with - * iov equivalents, replies with RPCSEC_GSS security must be - * encoded in a contiguous buffer. - * - * Nb, we should probably use getpagesize() on Unix. Need + /* Nb, we should probably use getpagesize() on Unix. Need * an equivalent for Windows. */ xioq = xdr_ioq_create(RPC_MAXDATA_DEFAULT, __svc_params->ioq.send_max + RPC_MAXDATA_DEFAULT, - (req->rq_msg.cb_cred.oa_flavor == RPCSEC_GSS) - ? UIO_FLAG_REALLOC | UIO_FLAG_FREE - : UIO_FLAG_FREE); + UIO_FLAG_FREE); if (!xdr_reply_encode(xioq->xdrs, &req->rq_msg)) { __warnx(TIRPC_DEBUG_FLAG_ERROR, From 941e93abc30cf44e5ba25cf7f1136c8cfd0cc022 Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Fri, 9 Aug 2019 13:58:41 -0700 Subject: [PATCH 36/70] Make xdr_ioq_putbufs usable for sending large buffers Signed-off-by: Frank S. 
Filz --- ntirpc/rpc/xdr.h | 1 + src/xdr_ioq.c | 47 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/ntirpc/rpc/xdr.h b/ntirpc/rpc/xdr.h index 423c8bd036..aa72b32625 100644 --- a/ntirpc/rpc/xdr.h +++ b/ntirpc/rpc/xdr.h @@ -144,6 +144,7 @@ typedef struct xdr_vio { #define UIO_FLAG_GIFT 0x0004 #define UIO_FLAG_MORE 0x0008 #define UIO_FLAG_REALLOC 0x0010 +#define UIO_FLAG_REFER 0x0020 struct xdr_uio; typedef void (*xdr_uio_release)(struct xdr_uio *, u_int); diff --git a/src/xdr_ioq.c b/src/xdr_ioq.c index b4938abf32..b95f7b0c91 100644 --- a/src/xdr_ioq.c +++ b/src/xdr_ioq.c @@ -190,16 +190,16 @@ xdr_ioq_uv_recycle(struct poolq_head *ioqh, struct poolq_entry *have) void xdr_ioq_uv_release(struct xdr_ioq_uv *uv) { - if (uv->u.uio_refer) { - /* not optional in this case! */ - uv->u.uio_refer->uio_release(uv->u.uio_refer, UIO_FLAG_NONE); - uv->u.uio_refer = NULL; - } - if (!(--uv->u.uio_references)) { if (uv->u.uio_release) { /* handle both xdr_ioq_uv and vio */ uv->u.uio_release(&uv->u, UIO_FLAG_NONE); + } else if (uv->u.uio_flags & UIO_FLAG_REFER) { + /* not optional in this case! 
*/ + __warnx(TIRPC_DEBUG_FLAG_XDR, "Call uio_release"); + uv->u.uio_refer->uio_release(uv->u.uio_refer, + UIO_FLAG_NONE); + mem_free(uv, sizeof(*uv)); } else if (uv->u.uio_flags & UIO_FLAG_FREE) { free_buffer(uv->v.vio_base, ioquv_size(uv)); mem_free(uv, sizeof(*uv)); @@ -583,6 +583,13 @@ xdr_ioq_putbufs(XDR *xdrs, xdr_uio *uio, u_int flags) xdr_vio *v; int ix; + /* update the most recent data length, just in case */ + xdr_tail_update(xdrs); + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s Before putbufs - pos %lu", + __func__, (unsigned long) XDR_GETPOS(xdrs)); + for (ix = 0; ix < uio->uio_count; ++ix) { /* advance fill pointer, do not allocate buffers, refs =1 */ uv = xdr_ioq_uv_advance(XIOQ(xdrs)); @@ -592,9 +599,29 @@ xdr_ioq_putbufs(XDR *xdrs, xdr_uio *uio, u_int flags) xdr_ioq_uv_update(XIOQ(xdrs), uv); v = &(uio->uio_vio[ix]); - uv->u.uio_flags = UIO_FLAG_NONE; /* !RECLAIM */ + uv->u.uio_flags = UIO_FLAG_REFER; uv->v = *v; + /* save original buffer sequence for rele */ + uv->u.uio_refer = uio; + (uio->uio_references)++; + + /* Now update the XDR position */ + xdrs->x_data = uv->v.vio_tail; + xdrs->x_base = &uv->v; + xdrs->x_v = uv->v; + + __warnx(TIRPC_DEBUG_FLAG_XDR, + "%s After putbufs Examining xdr_ioq_uv %p (base %p head %p tail %p wrap %p len %lu full %lu) pos %lu", + __func__, uv, uv->v.vio_base, uv->v.vio_head, + uv->v.vio_tail, uv->v.vio_wrap, + (unsigned long) ioquv_length(uv), + (unsigned long) (uintptr_t)xdrs->x_v.vio_wrap + - (uintptr_t)xdrs->x_v.vio_head, + (unsigned long) XDR_GETPOS(xdrs)); + } + + return (TRUE); #if 0 Saved for later golden buttery results -- Matt if (flags & XDR_PUTBUFS_FLAG_BRELE) { @@ -626,7 +653,6 @@ Saved for later golden buttery results -- Matt uv->v.vio_head = 0; } } -#endif /* save original buffer sequence for rele */ if (ix == 0) { uv->u.uio_refer = uio; @@ -635,6 +661,7 @@ Saved for later golden buttery results -- Matt } return (TRUE); +#endif } /* @@ -1001,7 +1028,7 @@ xdr_ioq_use_or_allocate(struct xdr_ioq *xioq, xdr_vio 
*v, struct xdr_ioq_uv *uv) /* First we need to fit in and encode the length of the trailer */ xdr_vio vlen; - + __warnx(TIRPC_DEBUG_FLAG_XDR, "%s Fitting length xdr_ioq_uv %p (base %p head %p tail %p wrap %p) size %lu length %lu has %lu looking for 4", __func__, uv, uv->v.vio_base, uv->v.vio_head, @@ -1212,7 +1239,7 @@ xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) /* Next vector buffer */ idx++; - } + } return true; } From c53407eaef9ba1726c7e6357f2e85a4230c45c0f Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Fri, 27 Sep 2019 08:38:14 -0700 Subject: [PATCH 37/70] Remove error messages from several xdr functions Some callers need to try and append to an xdr stream and if the data doesn't fit, back up. These callers expect an error to occur so let's not throw worrying messages into the log. Signed-off-by: Frank S. Filz --- ntirpc/rpc/xdr_inline.h | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/ntirpc/rpc/xdr_inline.h b/ntirpc/rpc/xdr_inline.h index 33b8e76696..b551024344 100644 --- a/ntirpc/rpc/xdr_inline.h +++ b/ntirpc/rpc/xdr_inline.h @@ -482,12 +482,8 @@ xdr_opaque_encode(XDR *xdrs, const char *cp, u_int cnt) * XDR_INLINE is just as likely to do a function call, * so don't bother with it here. 
*/ - if (!XDR_PUTBYTES(xdrs, cp, cnt)) { - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s:%u ERROR opaque", - __func__, __LINE__); + if (!XDR_PUTBYTES(xdrs, cp, cnt)) return (false); - } /* * round byte count to full xdr units @@ -498,12 +494,8 @@ xdr_opaque_encode(XDR *xdrs, const char *cp, u_int cnt) uint32_t zero = 0; if (!XDR_PUTBYTES(xdrs, (char *) &zero, - BYTES_PER_XDR_UNIT - rndup)) { - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s:%u ERROR zero", - __func__, __LINE__); + BYTES_PER_XDR_UNIT - rndup)) return (false); - } } return (true); @@ -640,12 +632,8 @@ xdr_bytes_encode(XDR *xdrs, char **cpp, u_int *sizep, u_int maxsize) return (false); } - if (!XDR_PUTUINT32(xdrs, size)) { - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s:%u ERROR size", - __func__, __LINE__); + if (!XDR_PUTUINT32(xdrs, size)) return (false); - } return (xdr_opaque_encode(xdrs, sp, size)); } @@ -840,12 +828,8 @@ xdr_array_encode(XDR *xdrs, char **cpp, u_int *sizep, u_int maxsize, return (false); } - if (!XDR_PUTUINT32(xdrs, size)) { - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s:%u ERROR size", - __func__, __LINE__); + if (!XDR_PUTUINT32(xdrs, size)) return (false); - } for (; (i < size) && stat; i++) { stat = (*xdr_elem) (xdrs, target); @@ -1002,12 +986,8 @@ xdr_string_encode(XDR *xdrs, char **cpp, u_int maxsize) return (false); } - if (!XDR_PUTUINT32(xdrs, size)) { - __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s:%u ERROR size", - __func__, __LINE__); + if (!XDR_PUTUINT32(xdrs, size)) return (false); - } return (xdr_opaque_encode(xdrs, *cpp, size)); } From f1c0cc056fc9777171d8178b50c15f6fe3928f4c Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 10 Oct 2019 14:38:03 -0400 Subject: [PATCH 38/70] LTTNG - fix xprt destroy trace to trace flags Signed-off-by: Daniel Gryniewicz --- ntirpc/lttng/xprt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ntirpc/lttng/xprt.h b/ntirpc/lttng/xprt.h index c6c0c87024..d418a4c319 100644 --- a/ntirpc/lttng/xprt.h +++ b/ntirpc/lttng/xprt.h @@ -1,7 +1,7 @@ /* * 
vim:noexpandtab:shiftwidth=8:tabstop=8: * - * Copyright 2018 Red Hat, Inc. and/or its affiliates. + * Copyright 2018-2019 Red Hat, Inc. and/or its affiliates. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -90,7 +90,7 @@ TRACEPOINT_EVENT( ctf_string(fnc, function) ctf_integer(unsigned int, line, line) ctf_integer_hex(void *, xprt, xprt) - ctf_integer(uint16_t, count, flags) + ctf_integer_hex(uint16_t, flags, flags) ) ) From ec67d93fd8c051332585d6f75a32ef7c6a634657 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 10 Oct 2019 14:38:51 -0400 Subject: [PATCH 39/70] IOQ - Fix svc_ioq_flush return value to be consistent In several error cases, it returned the wrong thing. Fix all cases to return 0 on success, <0 on failure, or EWOULDBLOCK if blocked. Signed-off-by: Daniel Gryniewicz --- src/svc_ioq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/svc_ioq.c b/src/svc_ioq.c index e06fb0d68a..af30a196c2 100644 --- a/src/svc_ioq.c +++ b/src/svc_ioq.c @@ -70,6 +70,7 @@ #define LAST_FRAG_XDR_UNITS ((LAST_FRAG - 1) & ~(BYTES_PER_XDR_UNIT - 1)) #define MAXALLOCA (256) +/* Returns 0 on success, EWOULDBLOCK if would block, <0 on error */ static inline int svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) { @@ -192,7 +193,7 @@ svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) if (!XDR_FILLBUFS(xioq->xdrs, xioq->write_start, vio, fbytes)) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s() XDR_FILLBUFS failed", __func__); - SVC_DESTROY(xprt); + error = -1; break; } @@ -244,6 +245,8 @@ svc_ioq_flushv(SVCXPRT *xprt, struct xdr_ioq *xioq) /* Socket buffer full; don't destroy */ error = EWOULDBLOCK; xioq->has_blocked = true; + } else { + error = result; } break; } From fbc17514c183d2c16a461cb8da0c5347891afe3f Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 10 Oct 2019 14:40:22 -0400 Subject: [PATCH 40/70] Fix xprt refcounting for non-blocking write Nonblocking write was missing a ref in the case 
where the first entry in the queue finished, causing the epoll to be unhooked. If the next entry blocked, then epoll is hooked again, but the hook path didn't take a ref, causing an extra unref later. This destroys the xprt, closing the connection, and hanging the client. Signed-off-by: Daniel Gryniewicz --- src/svc_rqst.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index e35c630ee0..2fdc4cb4c8 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -665,6 +665,8 @@ svc_rqst_rearm_events_locked(SVCXPRT *xprt, uint16_t ev_flags) if (sr_rec->ev_flags & SVC_RQST_FLAG_SHUTDOWN) return (0); + /* Currently, can only be called with one of ADDED_RECV or ADDED_SEND, so we + * only need to take one ref. */ SVC_REF(xprt, SVC_REF_FLAG_NONE); /* assuming success */ @@ -993,6 +995,8 @@ svc_rqst_evchan_write(SVCXPRT *xprt, struct xdr_ioq *xioq, bool has_blocked) code = svc_rqst_rearm_events_locked(xprt, SVC_XPRT_FLAG_ADDED_SEND); } else { + /* svc_rqst_hook_events doesn't take a ref, so take one here */ + SVC_REF(xprt, SVC_REF_FLAG_NONE); code = svc_rqst_hook_events(rec, sr_rec, SVC_XPRT_FLAG_ADDED_SEND); } From 83a31500e81980953c14838a9b00a62414eba234 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 10 Oct 2019 14:40:36 -0400 Subject: [PATCH 41/70] Remove release from svc_request() The new API for requests is asymmetric; that is, it's expected that the alloc callback will take a ref on the xprt, but svc_request() frees that. Fix this so that it's expected that the free callback will release this ref. Requires a fix in Ganesha. 
Signed-off-by: Daniel Gryniewicz --- src/svc_rqst.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index 2fdc4cb4c8..a90fcf8581 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -1219,8 +1219,6 @@ enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs) __svc_params->free_cb(req, stat); - SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); - return stat; } From 2b3ede12207e437f31121628deb8e01aab16e40d Mon Sep 17 00:00:00 2001 From: Yang Ruifeng Date: Wed, 23 Oct 2019 18:38:19 +0800 Subject: [PATCH 42/70] SVC IOQ - correctly dequeue the xioq Signed-off-by: Yang Ruifeng --- src/svc_ioq.c | 82 +++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/src/svc_ioq.c b/src/svc_ioq.c index af30a196c2..46b944b257 100644 --- a/src/svc_ioq.c +++ b/src/svc_ioq.c @@ -320,13 +320,12 @@ void svc_ioq_write(SVCXPRT *xprt) tracepoint(xprt, mutex, __func__, __LINE__, xprt); #endif /* USE_LTTNG_NTIRPC */ mutex_lock(&rec->writeq.qmutex); - + /* Process the xioq from the head of the xprt queue */ have = TAILQ_FIRST(&rec->writeq.qh); + mutex_unlock(&rec->writeq.qmutex); while (have != NULL) { int rc = 0; - /* Process the xioq from the head of the xprt queue */ - mutex_unlock(&rec->writeq.qmutex); xioq = _IOQ(have); @@ -340,29 +339,22 @@ void svc_ioq_write(SVCXPRT *xprt) rc = svc_ioq_flushv(xprt, xioq); } - if (rc != EWOULDBLOCK) { - if (rc < 0) { - /* IO failed, destroy rather than releasing */ - __warnx(TIRPC_DEBUG_FLAG_SVC_VC, - "%s: %p fd %d About to destroy - rc = %d", - __func__, xprt, xprt->xp_fd, rc); - SVC_DESTROY(xprt); - } else { - __warnx(TIRPC_DEBUG_FLAG_SVC_VC, - "%s: %p fd %d About to release", - __func__, xprt, xprt->xp_fd); - SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); - } - - XDR_DESTROY(xioq->xdrs); - } - #ifdef USE_LTTNG_NTIRPC tracepoint(xprt, mutex, __func__, __LINE__, &rec->xprt); #endif /* USE_LTTNG_NTIRPC */ mutex_lock(&rec->writeq.qmutex); + if (rc < 0) { + /* Dequeue the failed 
request */ + TAILQ_REMOVE(&rec->writeq.qh, have, q); + mutex_unlock(&rec->writeq.qmutex); - if (rc == EWOULDBLOCK) { + /* IO failed, destroy rather than releasing */ + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d About to destroy - rc = %d", + __func__, xprt, xprt->xp_fd, rc); + SVC_DESTROY(xprt); + break; + } else if (rc == EWOULDBLOCK){ __warnx(TIRPC_DEBUG_FLAG_SVC_VC, "%s: %p fd %d EWOULDBLOCK", __func__, xprt, xprt->xp_fd); @@ -372,34 +364,42 @@ void svc_ioq_write(SVCXPRT *xprt) &rec->xprt); #endif /* USE_LTTNG_NTIRPC */ svc_rqst_evchan_write(xprt, xioq, has_blocked); + mutex_unlock(&rec->writeq.qmutex); break; - } else if (xioq->has_blocked) { - __warnx(TIRPC_DEBUG_FLAG_SVC_VC, - "%s: %p fd %d COMPLETED AFTER BLOCKING", - __func__, xprt, xprt->xp_fd); + } else { + if (xioq->has_blocked) { + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d COMPLETED AFTER BLOCKING", + __func__, xprt, xprt->xp_fd); #ifdef USE_LTTNG_NTIRPC - tracepoint(xprt, write_complete, __func__, __LINE__, - &rec->xprt, (int) xioq->has_blocked); + tracepoint(xprt, write_complete, __func__, __LINE__, + &rec->xprt, (int) xioq->has_blocked); #endif /* USE_LTTNG_NTIRPC */ - svc_rqst_xprt_send_complete(xprt); - } else { - __warnx(TIRPC_DEBUG_FLAG_SVC_VC, - "%s: %p fd %d COMPLETED", - __func__, xprt, xprt->xp_fd); + svc_rqst_xprt_send_complete(xprt); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d COMPLETED", + __func__, xprt, xprt->xp_fd); #ifdef USE_LTTNG_NTIRPC - tracepoint(xprt, write_complete, __func__, __LINE__, - &rec->xprt, (int) xioq->has_blocked); + tracepoint(xprt, write_complete, __func__, __LINE__, + &rec->xprt, (int) xioq->has_blocked); #endif /* USE_LTTNG_NTIRPC */ - } + } - /* Dequeue the completed request */ - TAILQ_REMOVE(&rec->writeq.qh, have, q); + /* Dequeue the completed request */ + TAILQ_REMOVE(&rec->writeq.qh, have, q); - /* Fetch the next request */ - have = TAILQ_FIRST(&rec->writeq.qh); - } + /* Fetch the next request */ + have = 
TAILQ_FIRST(&rec->writeq.qh); + mutex_unlock(&rec->writeq.qmutex); - mutex_unlock(&rec->writeq.qmutex); + __warnx(TIRPC_DEBUG_FLAG_SVC_VC, + "%s: %p fd %d About to release", + __func__, xprt, xprt->xp_fd); + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); + XDR_DESTROY(xioq->xdrs); + } + } } static void From dd7f90b80030d258576e96730e0cf1d5ee967c73 Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Tue, 29 Oct 2019 14:46:52 -0700 Subject: [PATCH 43/70] Add some debug to show type of gss service Signed-off-by: Frank S. Filz --- src/auth_gss.c | 8 +++++++- src/svc_auth_gss.c | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/auth_gss.c b/src/auth_gss.c index 654d1ed0cf..74e372f8b8 100644 --- a/src/auth_gss.c +++ b/src/auth_gss.c @@ -620,7 +620,13 @@ authgss_wrap(AUTH *auth, XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr) { struct rpc_gss_data *gd = AUTH_PRIVATE(auth); - __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "%s()", __func__); + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "%s() %d %s", __func__, + !gd->established ? 0 : gd->sec.svc, + !gd->established ? "not established" + : gd->sec.svc == RPCSEC_GSS_SVC_NONE ? "krb5" + : gd->sec.svc == RPCSEC_GSS_SVC_INTEGRITY ? "krb5i" + : gd->sec.svc == RPCSEC_GSS_SVC_PRIVACY ? "krb5p" + : "unknown"); if (!gd->established || gd->sec.svc == RPCSEC_GSS_SVC_NONE) return ((*xdr_func) (xdrs, xdr_ptr)); diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index ae9052c70a..87359cb699 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -716,6 +716,15 @@ svcauth_gss_wrap(struct svc_req *req, XDR *xdrs) req->rq_msg.rq_cred_body; bool result; + + __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "%s() %d %s", __func__, + !gd->established ? 0 : gc->gc_svc, + !gd->established ? "not established" + : gc->gc_svc == RPCSEC_GSS_SVC_NONE ? "krb5" + : gc->gc_svc == RPCSEC_GSS_SVC_INTEGRITY ? "krb5i" + : gc->gc_svc == RPCSEC_GSS_SVC_PRIVACY ? 
"krb5p" + : "unknown"); + if (!gd->established || gc->gc_svc == RPCSEC_GSS_SVC_NONE) return (svc_auth_none.svc_ah_ops->svc_ah_wrap(req, xdrs)); From dd10d112dca8253a29e53b7097961f5633a6f2bb Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Tue, 29 Oct 2019 16:22:32 -0700 Subject: [PATCH 44/70] Rework xdr_iov and gss_iov so VIO_TRAILER_LEN is an explicit buffer This doesn't change the code execution really but makes the debug look nicer. The VIO_TRAILER_LEN is not included in the gss_iov. Signed-off-by: Frank S. Filz --- ntirpc/rpc/xdr.h | 2 +- src/authgss_prot.c | 66 ++++++++++++++++++++++++------------------- src/xdr_ioq.c | 70 ++++++++++++++++++++++------------------------ 3 files changed, 73 insertions(+), 65 deletions(-) diff --git a/ntirpc/rpc/xdr.h b/ntirpc/rpc/xdr.h index aa72b32625..f773751d20 100644 --- a/ntirpc/rpc/xdr.h +++ b/ntirpc/rpc/xdr.h @@ -121,8 +121,8 @@ enum xdr_op { typedef enum vio_type { VIO_HEADER, /* header buffer before data */ VIO_DATA, /* data buffer */ + VIO_TRAILER_LEN, /* length field for following TRAILER buffer */ VIO_TRAILER, /* trailer buffer after data */ - VIO_TRAILER_LEN, /* trailer buffer that needs a length ahead */ } vio_type; /* XDR buffer vector descriptors */ diff --git a/src/authgss_prot.c b/src/authgss_prot.c index 2786105510..b01578f452 100644 --- a/src/authgss_prot.c +++ b/src/authgss_prot.c @@ -228,7 +228,7 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, { gss_buffer_desc databuf, wrapbuf; OM_uint32 maj_stat, min_stat; - int start, end, conf_state, iov_count, data_count, after_data, i; + int start, end, conf_state, xv_count, gv_count, data_count, after_data, i; bool xdr_stat, vector; u_int databuflen, maxwrapsz; gss_iov_buffer_desc *gss_iov = NULL; @@ -299,22 +299,26 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, } if (svc == RPCSEC_GSS_SVC_INTEGRITY) { - /* Add a trailer buffer for the MIC */ - iov_count = data_count + 1; + /* Add a trailer length (which won't be part of the 
gss_iov + * and trailer buffer for the MIC + */ + xv_count = data_count + 2; + gv_count = data_count + 1; after_data = data_count; } else if (svc == RPCSEC_GSS_SVC_PRIVACY) { /* Add header, padding, and trailer for the wrap */ - iov_count = data_count + 3; + xv_count = data_count + 3; + gv_count = data_count + 3; after_data = data_count + 1; } __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, - "data_count=%d, iov_count=%d, after_data=%d", - data_count, iov_count, after_data); + "data_count=%d, gv_count=%d, xv_count=%d, after_data=%d", + data_count, gv_count, xv_count, after_data); /* Determine the size of the gss_iov */ - gvsize = iov_count * sizeof(gss_iov_buffer_desc); - xvsize = iov_count * sizeof(xdr_vio); + gvsize = gv_count * sizeof(gss_iov_buffer_desc); + xvsize = xv_count * sizeof(xdr_vio); /* Allocate the gss_iov */ if (unlikely(gvsize > MAXALLOCA)) { @@ -322,13 +326,13 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "mem_alloc gss_iov=%p size %llu count %d", gss_iov, (unsigned long long) gvsize, - iov_count); + gv_count); } else { gss_iov = alloca(gvsize); __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "alloca gss_iov=%p size %llu count %d", gss_iov, (unsigned long long) gvsize, - iov_count); + gv_count); } /* Allocate the xdr_iov */ @@ -337,13 +341,13 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "mem_alloc xdr_iov=%p size %llu count %d", xdr_iov, (unsigned long long) xvsize, - iov_count); + xv_count); } else { xdr_iov = alloca(xvsize); __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "alloca xdr_iov=%p size %llu count %d", xdr_iov, (unsigned long long) xvsize, - iov_count); + xv_count); } memset(gss_iov, 0, gvsize); @@ -361,12 +365,12 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, xdr_stat = XDR_FILLBUFS(xdrs, start + 4, data, databuflen); /* Now show the gss_iov and xdr_iov */ - show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + 
show_gss_xdr_iov(gss_iov, gv_count, xdr_iov, xv_count, "after XDR_FILLBUFS"); /* Now set up the gss_iov */ __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, "Set up gss_iov"); - for (i = 0; i < iov_count; i++) { + for (i = 0; i < gv_count; i++) { if (i == 0 && svc == RPCSEC_GSS_SVC_PRIVACY) { /* Fill in HEADER buffer */ gss_iov[i].type = GSS_IOV_BUFFER_TYPE_HEADER; @@ -394,7 +398,7 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, } /* Now show the gss_iov and xdr_iov */ - show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + show_gss_xdr_iov(gss_iov, gv_count, xdr_iov, xv_count, "after setting up gss_iov"); /* At this point gss_iov HEADER, PADDING, and TRAILER have @@ -407,7 +411,7 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, if (svc == RPCSEC_GSS_SVC_INTEGRITY) { /* Now call gss_get_mic_iov_length */ maj_stat = gss_get_mic_iov_length(&min_stat, ctx, qop, - gss_iov, iov_count); + gss_iov, gv_count); if (maj_stat != GSS_S_COMPLETE) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, @@ -419,10 +423,14 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, goto out; } + /* Set up the VIO_TRAILER_LEN buffer in the xdr_iov */ + xdr_iov[after_data].vio_length = BYTES_PER_XDR_UNIT; + xdr_iov[after_data].vio_type = VIO_TRAILER_LEN; + /* Copy the TRAILER buffer length into the xdr_iov */ - xdr_iov[after_data].vio_length = + xdr_iov[after_data + 1].vio_length = gss_iov[after_data].buffer.length; - xdr_iov[after_data].vio_type = VIO_TRAILER_LEN; + xdr_iov[after_data + 1].vio_type = VIO_TRAILER; /* Marshal databody_integ length. 
Note tha this will * leave the cursor position at start + 4 but the @@ -444,7 +452,7 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, /* Now call gss_wrap_iov_length */ maj_stat = gss_wrap_iov_length(&min_stat, ctx, true, qop, GSS_C_QOP_DEFAULT, - gss_iov, iov_count); + gss_iov, gv_count); if (maj_stat != GSS_S_COMPLETE) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, @@ -500,7 +508,7 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, } /* Now show the gss_iov and xdr_iov */ - show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + show_gss_xdr_iov(gss_iov, gv_count, xdr_iov, xv_count, "after gss_...length"); /* At this point: @@ -513,31 +521,33 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, * The cursor position will be updated to the end of the * TRAILER. */ - xdr_stat = XDR_ALLOCHDRS(xdrs, start + 4, xdr_iov, iov_count); + xdr_stat = XDR_ALLOCHDRS(xdrs, start + 4, xdr_iov, xv_count); if (!xdr_stat) goto out; /* Now show the gss_iov and xdr_iov */ - show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + show_gss_xdr_iov(gss_iov, gv_count, xdr_iov, xv_count, "after XDR_ALLOCHDRS"); /* At this point the xdr_iov is completely filled in. */ if (svc == RPCSEC_GSS_SVC_INTEGRITY) { - /* Copy the TRAILER buffer into the gss_iov */ + /* Copy the TRAILER buffer into the gss_iov (remember + * it's AFTER the VIO_TRAILER_LEN buffer. 
+ */ gss_iov[after_data].buffer.value = - xdr_iov[after_data].vio_head; + xdr_iov[after_data + 1].vio_head; /* At this point the gss_iov is completely filled in */ /* Now show the gss_iov and xdr_iov */ - show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + show_gss_xdr_iov(gss_iov, gv_count, xdr_iov, xv_count, "just before gss_get_mic_iov"); /* Now call gss_get_mic_iov */ maj_stat = gss_get_mic_iov(&min_stat, ctx, qop, - gss_iov, iov_count); + gss_iov, gv_count); if (maj_stat != GSS_S_COMPLETE) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, @@ -563,13 +573,13 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, /* At this point the gss_iov is completely filled in */ /* Now show the gss_iov and xdr_iov */ - show_gss_xdr_iov(gss_iov, iov_count, xdr_iov, iov_count, + show_gss_xdr_iov(gss_iov, gv_count, xdr_iov, xv_count, "just before gss_wrap_iov"); /* Now call gss_wrap_iov */ maj_stat = gss_wrap_iov(&min_stat, ctx, true, GSS_C_QOP_DEFAULT, NULL, - gss_iov, iov_count); + gss_iov, gv_count); if (maj_stat != GSS_S_COMPLETE) { __warnx(TIRPC_DEBUG_FLAG_RPCSEC_GSS, diff --git a/src/xdr_ioq.c b/src/xdr_ioq.c index b95f7b0c91..3054c22fe5 100644 --- a/src/xdr_ioq.c +++ b/src/xdr_ioq.c @@ -1024,39 +1024,6 @@ xdr_ioq_use_or_allocate(struct xdr_ioq *xioq, xdr_vio *v, struct xdr_ioq_uv *uv) */ uint32_t htlen = v->vio_length; - if (v->vio_type == VIO_TRAILER_LEN) { - /* First we need to fit in and encode the length of the trailer - */ - xdr_vio vlen; - - __warnx(TIRPC_DEBUG_FLAG_XDR, - "%s Fitting length xdr_ioq_uv %p (base %p head %p tail %p wrap %p) size %lu length %lu has %lu looking for 4", - __func__, uv, uv->v.vio_base, uv->v.vio_head, - uv->v.vio_tail, uv->v.vio_wrap, - (unsigned long) ioquv_size(uv), - (unsigned long) ioquv_length(uv), - (unsigned long) ioquv_more(uv)); - - /* Set up a dummy xdr_vio for the length */ - memset(&vlen, 0, sizeof(vlen)); - vlen.vio_type = VIO_TRAILER; - vlen.vio_length = 4; - - /* Now recursively call to get space for the length */ 
- uv = xdr_ioq_use_or_allocate(xioq, &vlen, uv); - - /* Now we have space, either in the previous buffer or a new - * buffer, go ahead and encode the length into it. - */ - *((uint32_t *) (vlen.vio_head)) = - (uint32_t) htonl(v->vio_length); - - /* Becasue we have already set up the gss_iov, it's ok to - * sneak the length it, it won't be part of the gss_iov but it - * IS part of the xdr_iov. - */ - } - __warnx(TIRPC_DEBUG_FLAG_XDR, "%s Examining xdr_ioq_uv %p (base %p head %p tail %p wrap %p) size %lu length %lu has %lu looking for %lu", __func__, uv, uv->v.vio_base, uv->v.vio_head, @@ -1065,7 +1032,7 @@ xdr_ioq_use_or_allocate(struct xdr_ioq *xioq, xdr_vio *v, struct xdr_ioq_uv *uv) (unsigned long) ioquv_more(uv), htlen); if (ioquv_more(uv) >= htlen) { - /* The HEADER will fit */ + /* The HEADER or TRAILER will fit */ v->vio_base = uv->v.vio_base; v->vio_head = uv->v.vio_tail; v->vio_tail = uv->v.vio_tail + htlen; @@ -1111,6 +1078,16 @@ xdr_ioq_use_or_allocate(struct xdr_ioq *xioq, xdr_vio *v, struct xdr_ioq_uv *uv) uv->v.vio_tail = v->vio_tail; } + if (v->vio_type == VIO_TRAILER_LEN) { + /* Now that we have buffer space for the trailer len, we can + * peek ahead to the next buffer and get it's length and fill + * the length into the buffer. Note that this buffer is not + * part of the gss_iov. + */ + *((uint32_t *) (v[0].vio_head)) = + (uint32_t) htonl(v[1].vio_length); + } + __warnx(TIRPC_DEBUG_FLAG_XDR, "%s Produced xdr_ioq_uv %p (base %p head %p tail %p wrap %p) size %lu length %lu", __func__, uv, uv->v.vio_base, uv->v.vio_head, @@ -1231,9 +1208,30 @@ xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) while (idx < iov_count) { /* Another TRAILER buffer to manage */ + vio_type vt = vector[idx].vio_type; + __warnx(TIRPC_DEBUG_FLAG_XDR, - "Calling xdr_ioq_use_or_allocate for idx %d for VIO_TRAILER", - idx); + "Calling xdr_ioq_use_or_allocate for idx %d for %s", + idx, + vt == VIO_HEADER ? "VIO_HEADER" + : vt == VIO_DATA ? 
"VIO_DATA" + : vt == VIO_TRAILER_LEN ? "VIO_TRAILER_LEN" + : vt == VIO_TRAILER ? "VIO_TRAILER" + : "UNKNOWN"); + + if (vt != VIO_TRAILER && vt != VIO_TRAILER_LEN) { + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Oops, buffer other than a trailer found after all data"); + return false; + } + + if (vt == VIO_TRAILER_LEN && + ((idx + 1) == iov_count || + vector[idx + 1].vio_type != VIO_TRAILER)) { + __warnx(TIRPC_DEBUG_FLAG_XDR, + "Oops, VIO_TRAILER_LEN not followed by VIO_TRAILER"); + return false; + } uv = xdr_ioq_use_or_allocate(xioq, &vector[idx], uv); From 9765afe12d9079dbbaef1e4d3d0a8d7b2104c038 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Mon, 21 Oct 2019 08:53:11 -0400 Subject: [PATCH 45/70] XDR IOQ - Update position to end of XDR When calling allochdrs(), it needs to update the posision to the end of the XDR, or else only partial data is sent. The position had been previously updated to the location of the length field. Signed-off-by: Daniel Gryniewicz Signed-off-by: Frank S. Filz --- src/xdr_ioq.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/xdr_ioq.c b/src/xdr_ioq.c index 3054c22fe5..e56e6ad203 100644 --- a/src/xdr_ioq.c +++ b/src/xdr_ioq.c @@ -1106,6 +1106,7 @@ xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) int idx = 0; struct xdr_ioq *xioq = XIOQ(xdrs); struct poolq_entry *have; + u_int totlen = start; /* update the most recent data length, just in case */ xdr_tail_update(xdrs); @@ -1175,6 +1176,9 @@ xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) uv = xdr_ioq_use_or_allocate(xioq, &vector[idx], uv); + /* Record used space */ + totlen += vector[idx].vio_length; + /* Advance to next (DATA) buffer */ idx++; } @@ -1196,6 +1200,9 @@ xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) "Skipping idx %d for VIO_DATA", idx); + /* Record used space */ + totlen += vector[idx].vio_length; + if (have != NULL) { /* Next buffer exists */ uv = IOQ_(have); @@ -1235,10 
+1242,16 @@ xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) uv = xdr_ioq_use_or_allocate(xioq, &vector[idx], uv); + /* Record used space */ + totlen += vector[idx].vio_length; + /* Next vector buffer */ idx++; } + /* Update position to end of the last buffer */ + XDR_SETPOS(xdrs, start + totlen); + return true; } From 0695d2e364dbcd1a5ae9072eaf85b67c0172551e Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Wed, 30 Oct 2019 13:22:10 -0700 Subject: [PATCH 46/70] Oops - need to SETPOS just to totlen Signed-off-by: Frank S. Filz --- src/xdr_ioq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xdr_ioq.c b/src/xdr_ioq.c index e56e6ad203..eef5efebed 100644 --- a/src/xdr_ioq.c +++ b/src/xdr_ioq.c @@ -1250,7 +1250,7 @@ xdr_ioq_allochdrs(XDR *xdrs, u_int start, xdr_vio *vector, int iov_count) } /* Update position to end of the last buffer */ - XDR_SETPOS(xdrs, start + totlen); + XDR_SETPOS(xdrs, totlen); return true; } From e1bdb9c4cef7a31ae63c85b9c3b8044a46582b19 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 31 Oct 2019 13:18:41 -0400 Subject: [PATCH 47/70] 3.0-rc1 Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 16 ++++++++++++---- src/libntirpc.map.in.cmake | 2 +- version-h.in.cmake | 3 ++- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 878dcf40b3..554f0fffbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,17 +13,25 @@ include(${CMAKE_SOURCE_DIR}/cmake/maintainer_mode.cmake) project(NTIRPC C) # version numbers -set(NTIRPC_MAJOR_VERSION 1) -set(NTIRPC_MINOR_VERSION 8) -set(NTIRPC_PATCH_LEVEL 0) +set(NTIRPC_MAJOR_VERSION 3) +set(NTIRPC_MINOR_VERSION 0) +# This is .0 for a release, .N for a stable branch, blank for development +set(NTIRPC_PATCH_LEVEL ) +# This is the dev/RC version. Should be blank on release/stable. Format is +# -something +set(NTIRPC_VERSION_EXTRA -rc1) set(VERSION_COMMENT "Full-duplex and bi-directional ONC RPC on TCP." 
) # version string used for packaging set(NTIRPC_VERSION - "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}.${NTIRPC_PATCH_LEVEL}") + "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}${NTIRPC_PATCH_LEVEL}${NTIRPC_VERSION_EXTRA}") +set(NTIRPC_VERSION_BASE + "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}${NTIRPC_PATCH_LEVEL}") # Up scope for embedding in ganesha set(NTIRPC_VERSION_EMBED "${NTIRPC_VERSION}" PARENT_SCOPE) +set(NTIRPC_VERSION_BASE_EMBED "${NTIRPC_VERSION_BASE}" PARENT_SCOPE) +set(NTIRPC_VERSION_EXTRA_EMBED "${NTIRPC_VERSION_EXTRA}" PARENT_SCOPE) set(NTIRPC_ABI_EMBED "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}" PARENT_SCOPE) set( PACKNAME "${NTIRPC_VERSION}" ) diff --git a/src/libntirpc.map.in.cmake b/src/libntirpc.map.in.cmake index 7cec34f02d..b6ab0d9f2c 100644 --- a/src/libntirpc.map.in.cmake +++ b/src/libntirpc.map.in.cmake @@ -1,4 +1,4 @@ -NTIRPC_${NTIRPC_VERSION} { +NTIRPC_${NTIRPC_VERSION_BASE} { global: # __* __ntirpc_pkg_params; diff --git a/version-h.in.cmake b/version-h.in.cmake index e7a859e5fb..692500a7d8 100644 --- a/version-h.in.cmake +++ b/version-h.in.cmake @@ -6,8 +6,9 @@ #define NTIRPC_VERSION_MAJOR @NTIRPC_MAJOR_VERSION@ #define NTIRPC_VERSION_MINOR @NTIRPC_MINOR_VERSION@ #define NTIRPC_PATCH_LEVEL @NTIRPC_PATCH_LEVEL@ +#define NTIRPC_VERSION_EXTRA @NTIRPC_VERSION_EXTRA@ -#define NTIRPC_VERSION "@NTIRPC_MAJOR_VERSION@.@NTIRPC_MINOR_VERSION@.@NTIRPC_PATCH_LEVEL@" +#define NTIRPC_VERSION "@NTIRPC_VERSION@" #define NTIRPC_VERSION_COMMENT "@VERSION_COMMENT@" #define _GIT_HEAD_COMMIT "@_GIT_HEAD_COMMIT@" #define _GIT_DESCRIBE "@_GIT_DESCRIBE@" From 785a6efd7cefa22fbc1cc7255fa4d33073c0d91e Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 31 Oct 2019 15:21:38 -0400 Subject: [PATCH 48/70] 3.0-rc2 rc1 was tagged, but did not update the version, so set to rc2 so we can tag. 
Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 554f0fffbf..1d467247df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ set(NTIRPC_MINOR_VERSION 0) set(NTIRPC_PATCH_LEVEL ) # This is the dev/RC version. Should be blank on release/stable. Format is # -something -set(NTIRPC_VERSION_EXTRA -rc1) +set(NTIRPC_VERSION_EXTRA -rc2) set(VERSION_COMMENT "Full-duplex and bi-directional ONC RPC on TCP." ) From 1da6533431a23af7406b5961d4b16ef61045b6af Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Wed, 6 Nov 2019 13:48:21 -0500 Subject: [PATCH 49/70] 3.0 Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 14 ++++++-------- config-h.in.cmake | 4 ++-- src/CMakeLists.txt | 2 +- src/lttng/CMakeLists.txt | 2 +- version-h.in.cmake | 1 - 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d467247df..3a58db0de7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,25 +14,23 @@ project(NTIRPC C) # version numbers set(NTIRPC_MAJOR_VERSION 3) -set(NTIRPC_MINOR_VERSION 0) # This is .0 for a release, .N for a stable branch, blank for development -set(NTIRPC_PATCH_LEVEL ) -# This is the dev/RC version. Should be blank on release/stable. Format is -# -something -set(NTIRPC_VERSION_EXTRA -rc2) +set(NTIRPC_MINOR_VERSION .0) +# -something for dev releases +set(NTIRPC_VERSION_EXTRA ) set(VERSION_COMMENT "Full-duplex and bi-directional ONC RPC on TCP." 
) # version string used for packaging set(NTIRPC_VERSION - "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}${NTIRPC_PATCH_LEVEL}${NTIRPC_VERSION_EXTRA}") + "${NTIRPC_MAJOR_VERSION}${NTIRPC_MINOR_VERSION}${NTIRPC_VERSION_EXTRA}") set(NTIRPC_VERSION_BASE - "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}${NTIRPC_PATCH_LEVEL}") + "${NTIRPC_MAJOR_VERSION}${NTIRPC_MINOR_VERSION}") # Up scope for embedding in ganesha set(NTIRPC_VERSION_EMBED "${NTIRPC_VERSION}" PARENT_SCOPE) set(NTIRPC_VERSION_BASE_EMBED "${NTIRPC_VERSION_BASE}" PARENT_SCOPE) set(NTIRPC_VERSION_EXTRA_EMBED "${NTIRPC_VERSION_EXTRA}" PARENT_SCOPE) -set(NTIRPC_ABI_EMBED "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}" PARENT_SCOPE) +set(NTIRPC_ABI_EMBED "${NTIRPC_MAJOR_VERSION}${NTIRPC_MINOR_VERSION}" PARENT_SCOPE) set( PACKNAME "${NTIRPC_VERSION}" ) diff --git a/config-h.in.cmake b/config-h.in.cmake index e259d6e9ae..8eb2ddf7bd 100644 --- a/config-h.in.cmake +++ b/config-h.in.cmake @@ -25,9 +25,9 @@ #define PACKAGE "libntirpc" #define PACKAGE_BUGREPORT "" #define PACKAGE_NAME "libntirpc" -#define PACKAGE_STRING "libntirpc ${NTIRPC_VERSION_MAJOR}.${NTIRPC_VERSION_MINOR}.${NTIRPC_PATCH_LEVEL}" +#define PACKAGE_STRING "libntirpc ${NTIRPC_VERSION_MAJOR}${NTIRPC_VERSION_MINOR}" #define PACKAGE_TARNAME "libntirpc" #define PACKAGE_URL "" -#define PACKAGE_VERSION "${NTIRPC_VERSION_MAJOR}.${NTIRPC_VERSION_MINOR}.${NTIRPC_PATCH_LEVEL}" +#define PACKAGE_VERSION "${NTIRPC_VERSION_MAJOR}${NTIRPC_VERSION_MINOR}" #endif /* CONFIG_H */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 34b025d19f..4bb9256236 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -107,7 +107,7 @@ target_link_libraries(ntirpc ${SYSTEM_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) set_target_properties(ntirpc PROPERTIES LINK_FLAGS "-Wl,--version-script=${PROJECT_BINARY_DIR}/libntirpc.map" VERSION ${NTIRPC_VERSION} - SOVERSION "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}" + SOVERSION "${NTIRPC_MAJOR_VERSION}${NTIRPC_MINOR_VERSION}" 
) install(TARGETS ntirpc DESTINATION ${LIB_INSTALL_DIR}) diff --git a/src/lttng/CMakeLists.txt b/src/lttng/CMakeLists.txt index d621a58473..b529ef4c75 100644 --- a/src/lttng/CMakeLists.txt +++ b/src/lttng/CMakeLists.txt @@ -16,7 +16,7 @@ target_link_libraries(ntirpc_tracepoints set_target_properties(ntirpc_tracepoints PROPERTIES VERSION ${NTIRPC_VERSION} - SOVERSION "${NTIRPC_MAJOR_VERSION}.${NTIRPC_MINOR_VERSION}" + SOVERSION "${NTIRPC_MAJOR_VERSION}${NTIRPC_MINOR_VERSION}" ) install(TARGETS ntirpc_tracepoints COMPONENT tracing DESTINATION ${LIB_INSTALL_DIR} ) diff --git a/version-h.in.cmake b/version-h.in.cmake index 692500a7d8..7c8c75c7de 100644 --- a/version-h.in.cmake +++ b/version-h.in.cmake @@ -5,7 +5,6 @@ #define NTIRPC_VERSION_MAJOR @NTIRPC_MAJOR_VERSION@ #define NTIRPC_VERSION_MINOR @NTIRPC_MINOR_VERSION@ -#define NTIRPC_PATCH_LEVEL @NTIRPC_PATCH_LEVEL@ #define NTIRPC_VERSION_EXTRA @NTIRPC_VERSION_EXTRA@ #define NTIRPC_VERSION "@NTIRPC_VERSION@" From 23966af9f0060d9f1b6dff2364601d6fab9491dc Mon Sep 17 00:00:00 2001 From: "Frank S. Filz" Date: Tue, 19 Nov 2019 10:26:48 -0800 Subject: [PATCH 50/70] Move async callback to svc_req and give it its own wpe Signed-off-by: Frank S. 
Filz --- ntirpc/rpc/svc.h | 6 ++++-- src/svc_rqst.c | 15 ++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/ntirpc/rpc/svc.h b/ntirpc/rpc/svc.h index d3eded4315..6c90493aaf 100644 --- a/ntirpc/rpc/svc.h +++ b/ntirpc/rpc/svc.h @@ -239,8 +239,6 @@ struct svc_xprt { svc_req_fun_t process_cb; svc_xprt_fun_t rendezvous_cb; } xp_dispatch; - /* Handle resumed requests */ - svc_req_fun_t xp_resume_cb; SVCXPRT *xp_parent; char *xp_tp; /* transport provider device name */ @@ -317,6 +315,10 @@ struct svc_req { void *rq_ap1; /* auth private */ void *rq_ap2; /* auth private */ + /* Handle resumed requests */ + svc_req_fun_t rq_resume_cb; + struct work_pool_entry rq_wpe; + /* avoid separate alloc/free */ struct rpc_msg rq_msg; diff --git a/src/svc_rqst.c b/src/svc_rqst.c index a90fcf8581..a5525b912e 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -1224,14 +1224,13 @@ enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs) static void svc_resume_task(struct work_pool_entry *wpe) { - struct rpc_dplx_rec *rec = - opr_containerof(wpe, struct rpc_dplx_rec, ioq.ioq_wpe); - struct svc_req *req = rec->svc_req; - SVCXPRT *xprt = &rec->xprt; + struct svc_req *req = + opr_containerof(wpe, struct svc_req, rq_wpe); + SVCXPRT *xprt = req->rq_xprt; enum xprt_stat stat; /* Resume the request. 
*/ - stat = req->rq_xprt->xp_resume_cb(req); + stat = req->rq_resume_cb(req); if (stat == XPRT_SUSPEND) { /* The rquest is suspended, don't touch the request in any way @@ -1253,10 +1252,8 @@ static void svc_resume_task(struct work_pool_entry *wpe) void svc_resume(struct svc_req *req) { - struct rpc_dplx_rec *rpc_dplx_rec = REC_XPRT(req->rq_xprt); - - rpc_dplx_rec->ioq.ioq_wpe.fun = svc_resume_task; - work_pool_submit(&svc_work_pool, &(rpc_dplx_rec->ioq.ioq_wpe)); + req->rq_wpe.fun = svc_resume_task; + work_pool_submit(&svc_work_pool, &req->rq_wpe); } /*static*/ void From 9cd6c50a50c89b77ed7972ae487b34d36235bed0 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Tue, 19 Nov 2019 13:58:47 -0500 Subject: [PATCH 51/70] Add make rpm target Add a make target to build rpms. Also, do a "make install" in the spec file to handle the case where the build dir isn't the source dir. Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 16 ++++++++++++++-- libntirpc.spec-in.cmake | 4 ++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a58db0de7..6325ad7818 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,8 +261,6 @@ configure_file( ) if (NOT TARGET dist) -# Define CPACK component (to deal with sub packages) -set(CPACK_COMPONENTS_ALL daemon fsal headers ) set(CPACK_COMPONENT_DAEMON_DISPLAY_NAME "libntirpc") # Include custom config and cpack module @@ -273,6 +271,20 @@ set( PKG_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}.tar.gz") add_custom_target(dist COMMAND ${CMAKE_MAKE_PROGRAM} package_source) endif (NOT TARGET dist) +if (NOT TARGET rpm) +add_custom_target( rpm DEPENDS dist) +add_custom_command(TARGET rpm + COMMAND sh -c "rpmbuild -ta ${PKG_NAME}" + VERBATIM + DEPENDS dist) + +set(RPMDEST "--define '_srcrpmdir ${CMAKE_CURRENT_BINARY_DIR}'") +add_custom_target( srpm DEPENDS dist) +add_custom_command(TARGET srpm + COMMAND sh -c "rpmbuild ${RPMDEST} -ts ${PKG_NAME}" + VERBATIM + DEPENDS dist) +endif (NOT 
TARGET rpm) ########### install files ############### install(FILES ${PROJECT_BINARY_DIR}/libntirpc.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig) diff --git a/libntirpc.spec-in.cmake b/libntirpc.spec-in.cmake index aa139300e8..399ca57ae8 100644 --- a/libntirpc.spec-in.cmake +++ b/libntirpc.spec-in.cmake @@ -53,8 +53,8 @@ install -p -m 0755 src/%{name}.so.%{version} %{buildroot}%{_libdir}/ ln -s %{name}.so.%{version} %{buildroot}%{_libdir}/%{name}.so.1 ln -s %{name}.so.%{version} %{buildroot}%{_libdir}/%{name}.so mkdir -p %{buildroot}%{_includedir}/ntirpc -cp -a ntirpc %{buildroot}%{_includedir}/ -install -p -m 644 libntirpc.pc %{buildroot}%{_libdir}/pkgconfig/ + +make DESTDIR=%{buildroot} install %post -p /sbin/ldconfig From a2af9be2584c0060ddb22056a95410f55b04e212 Mon Sep 17 00:00:00 2001 From: TweakySolution Date: Tue, 10 Dec 2019 09:22:52 -0600 Subject: [PATCH 52/70] SVC_RECV can suspend SVC_RECV call may suspend; particularly for async I/O in which response processing is handled later. Signed-off-by: TweakySolution --- src/svc_rqst.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index e35c630ee0..e4150dd903 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -1162,6 +1162,7 @@ svc_rqst_xprt_task_recv(struct work_pool_entry *wpe) struct xdr_ioq *ioq = opr_containerof(wpe, struct xdr_ioq, ioq_wpe); struct rpc_dplx_rec *rec = ioq->rec; + enum xprt_stat stat = XPRT_IDLE; atomic_clear_uint16_t_bits(&ioq->ioq_s.qflags, IOQ_FLAG_WORKING); @@ -1179,11 +1180,13 @@ svc_rqst_xprt_task_recv(struct work_pool_entry *wpe) * xp_refcnt need more than 1 (this task). */ (void)clock_gettime(CLOCK_MONOTONIC_FAST, &rec->recv.ts); - (void)SVC_RECV(&rec->xprt); + stat = SVC_RECV(&rec->xprt); } - /* If tests fail, log non-fatal "WARNING! already destroying!" */ - SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); + if (stat != XPRT_SUSPEND) { + /* If tests fail, log non-fatal "WARNING! already destroying!" 
*/ + SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); + } } enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs) From e5319c7b265ab8b8145f618143d0ea5ca8298f99 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Tue, 10 Dec 2019 13:16:55 -0500 Subject: [PATCH 53/70] Make sure auth flavor of response is set Even if the auth fails, the caller might want to know the auth flavor. Set the flavor early, so that it's always correct. Signed-off-by: Daniel Gryniewicz --- src/svc_auth.c | 1 + src/svc_auth_gss.c | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/svc_auth.c b/src/svc_auth.c index 16c85eabe2..8d942d2387 100644 --- a/src/svc_auth.c +++ b/src/svc_auth.c @@ -92,6 +92,7 @@ svc_auth_authenticate(struct svc_req *req, bool *no_dispatch) /* VARIABLES PROTECTED BY authsvc_lock: asp, Auths */ req->rq_msg.RPCM_ack.ar_verf = _null_auth; cred_flavor = req->rq_msg.cb_cred.oa_flavor; + req->rq_msg.RPCM_ack.ar_verf.oa_flavor = cred_flavor; switch (cred_flavor) { #ifdef _HAVE_GSSAPI case RPCSEC_GSS: diff --git a/src/svc_auth_gss.c b/src/svc_auth_gss.c index 87359cb699..c2376e3711 100644 --- a/src/svc_auth_gss.c +++ b/src/svc_auth_gss.c @@ -397,9 +397,6 @@ _svcauth_gss(struct svc_req *req, bool *no_dispatch) OM_uint32 min_stat; enum auth_stat rc = AUTH_OK; - /* Initialize reply. */ - req->rq_msg.RPCM_ack.ar_verf = _null_auth; - /* Unserialize client credentials. */ if (req->rq_msg.cb_cred.oa_length <= 0) { return AUTH_BADCRED; From 373a7148f7cd3e0cd13feecee67c43adb7405296 Mon Sep 17 00:00:00 2001 From: TweakySolution Date: Wed, 11 Dec 2019 08:08:22 -0600 Subject: [PATCH 54/70] SVC Request Rearm for Send Event Correct ev flags being sent to epoll_ctl to reference event_send when rearming events. 
Signed-off-by: TweakySolution --- src/svc_rqst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index a90fcf8581..89ec65a078 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -717,7 +717,7 @@ svc_rqst_rearm_events_locked(SVCXPRT *xprt, uint16_t ev_flags) } if (ev_flags & SVC_XPRT_FLAG_ADDED_SEND) { - ev = &rec->ev_u.epoll.event_recv; + ev = &rec->ev_u.epoll.event_send; /* set up epoll user data */ ev->data.ptr = rec; From 96018f8fcd4194cb70e9b4636f918e807d3b5e84 Mon Sep 17 00:00:00 2001 From: "Kaleb S. KEITHLEY" Date: Wed, 11 Dec 2019 15:03:59 -0500 Subject: [PATCH 55/70] svc_dg.c: cleanup pktinfo handling Prior to this change it was seen that the MSG_CTRUNC flag was set in the data returned by recvmsg(), an indication that that the controlmsg (i.e. pktinfo) was lost due to insufficient space. Increasing the size of struct svc_dg_xprt by making sc_cmsg an array of pktinfo fixes this. It was also seen that when using an IPv4 socket that two pktinfos are recv'd: an in_pktinfo and an in6_pktinfo. (One comment I saw somewhere said that it is related to ipv6-tunneled-over-ipv4.) In this change the in6_pktinfo is ignored/discarded and only the in_pktinfo is saved for later use in the reply. Presumably an IPv6 socket receives just a single in6_pktinfo, which is saved. Lastly, despite a comment that seemed to imply that pktinfo is/was set in the reply, there was no logic to actually set it; with the result that the kernel had no hints about which source address to set in the packet and defaulting to sending it with the lowest numeric IP address on the NIC. This may or may not have been the correct address, i.e. the address that the request was actually sent to. Discussions with the knfs team confirmed that NFS clients would consider this an error. 
And it was seen that the client's network stack could reject such packets in certain circumstances, and the NFS client in failing to receive such a reply would resend the requests (with those replies rejected as well) before eventually failing over to sending TCP requests as a (last ditch) attempt to resolve the mount. --- ntirpc/rpc/svc.h | 7 +++ src/svc_dg.c | 109 +++++++++++++++++++++++---------------------- src/svc_internal.h | 3 +- 3 files changed, 65 insertions(+), 54 deletions(-) diff --git a/ntirpc/rpc/svc.h b/ntirpc/rpc/svc.h index d3eded4315..524b4bcae3 100644 --- a/ntirpc/rpc/svc.h +++ b/ntirpc/rpc/svc.h @@ -273,6 +273,13 @@ struct svc_xprt { int32_t xp_refcnt; /* handle reference count */ uint16_t xp_flags; /* flags */ + + union { + struct in_pktinfo in; +#ifdef INET6 + struct in6_pktinfo in6; +#endif + } xp_pktinfo; }; /* Service record used by exported search routines */ diff --git a/src/svc_dg.c b/src/svc_dg.c index 156d5ff5f7..37c56937cc 100644 --- a/src/svc_dg.c +++ b/src/svc_dg.c @@ -382,8 +382,10 @@ svc_dg_reply(struct svc_req *req) XDR *xdrs = rec->ioq.xdrs; struct svc_dg_xprt *su = DG_DR(rec); struct msghdr *msg = &su->su_msghdr; + struct cmsghdr* cmsg; struct iovec iov; size_t slen; + char msg_control[sizeof(struct cmsghdr) + sizeof(struct in6_pktinfo)]; if (!xprt->xp_remote.nb.len) { __warnx(TIRPC_DEBUG_FLAG_WARN, @@ -415,13 +417,33 @@ svc_dg_reply(struct svc_req *req) msg->msg_iov = &iov; msg->msg_iovlen = 1; msg->msg_name = (struct sockaddr *)&xprt->xp_remote.ss; - msg->msg_namelen = xprt->xp_remote.nb.len; - /* cmsg already set in svc_dg_rendezvous */ + msg->msg_namelen = sizeof(struct sockaddr_storage); + msg->msg_control = msg_control; + msg->msg_controllen = sizeof(msg_control); + msg->msg_flags = 0; + + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = (xprt->xp_local.ss.ss_family == AF_INET) + ? IPPROTO_IP : IPPROTO_IPV6; /* a.k.a. SOL_IP and SOL_IPV6 */ + cmsg->cmsg_type = (xprt->xp_local.ss.ss_family == AF_INET) + ? 
IP_PKTINFO : IPV6_PKTINFO; + if (xprt->xp_local.ss.ss_family == AF_INET) + *(struct in_pktinfo*)CMSG_DATA(cmsg) = + *(struct in_pktinfo*) &xprt->xp_pktinfo; + else + *(struct in6_pktinfo*)CMSG_DATA(cmsg) = + *(struct in6_pktinfo*) &xprt->xp_pktinfo; + cmsg->cmsg_len = (xprt->xp_local.ss.ss_family == AF_INET) + ? CMSG_LEN(sizeof(struct in_pktinfo)) + : CMSG_LEN(sizeof(struct in6_pktinfo)); + msg->msg_controllen = (xprt->xp_local.ss.ss_family == AF_INET) + ? CMSG_SPACE(sizeof(struct in_pktinfo)) + : CMSG_SPACE(sizeof(struct in6_pktinfo)); if (sendmsg(xprt->xp_fd, msg, 0) != (ssize_t) slen) { __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: %p fd %d sendmsg failed (will set dead)", - __func__, xprt, xprt->xp_fd); + "%s: %p fd %d err %d sendmsg failed (will set dead)", + __func__, xprt, xprt->xp_fd, errno); return (XPRT_DIED); } @@ -585,22 +607,24 @@ svc_dg_rendezvous_ops(SVCXPRT *xprt) void svc_dg_enable_pktinfo(int fd, const struct __rpc_sockinfo *si) { - int val = 1; + int on = 1, off = 0; switch (si->si_af) { case AF_INET: #ifdef SOL_IP - (void)setsockopt(fd, SOL_IP, IP_PKTINFO, &val, sizeof(val)); + (void)setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on)); #endif break; case AF_INET6: #ifdef SOL_IP - (void)setsockopt(fd, SOL_IP, IP_PKTINFO, &val, sizeof(val)); + (void)setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on)); #endif #ifdef SOL_IPV6 (void)setsockopt(fd, SOL_IPV6, IPV6_RECVPKTINFO, - &val, sizeof(val)); + &on, sizeof(on)); + (void)setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, + &off, sizeof(off)); #endif break; } @@ -612,22 +636,13 @@ svc_dg_store_in_pktinfo(struct cmsghdr *cmsg, SVCXPRT *xprt) if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_PKTINFO && cmsg->cmsg_len >= CMSG_LEN(sizeof(struct in_pktinfo))) { - struct in_pktinfo *pkti = (struct in_pktinfo *) - CMSG_DATA(cmsg); - struct sockaddr_in *daddr = (struct sockaddr_in *) - &xprt->xp_local.ss; - - daddr->sin_family = AF_INET; -#ifdef __FreeBSD__ - daddr->sin_addr = pkti->ipi_addr; -#else - 
daddr->sin_addr.s_addr = pkti->ipi_spec_dst.s_addr; -#endif + xprt->xp_pktinfo.in = *(struct in_pktinfo *) CMSG_DATA(cmsg); + xprt->xp_local.ss.ss_family = AF_INET; + xprt->xp_local.nb.buf = &xprt->xp_pktinfo; xprt->xp_local.nb.len = sizeof(struct sockaddr_in); return 1; - } else { - return 0; } + return 0; } static int @@ -636,19 +651,14 @@ svc_dg_store_in6_pktinfo(struct cmsghdr *cmsg, SVCXPRT *xprt) if (cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO && cmsg->cmsg_len >= CMSG_LEN(sizeof(struct in6_pktinfo))) { - struct in6_pktinfo *pkti = (struct in6_pktinfo *) - CMSG_DATA(cmsg); - struct sockaddr_in6 *daddr = (struct sockaddr_in6 *) - &xprt->xp_local.ss; - - daddr->sin6_family = AF_INET6; - daddr->sin6_addr = pkti->ipi6_addr; - daddr->sin6_scope_id = pkti->ipi6_ifindex; + + xprt->xp_pktinfo.in6 = *(struct in6_pktinfo *) CMSG_DATA(cmsg); + xprt->xp_local.ss.ss_family = AF_INET6; + xprt->xp_local.nb.buf = &xprt->xp_pktinfo; xprt->xp_local.nb.len = sizeof(struct sockaddr_in6); return 1; - } else { - return 0; } + return 0; } /* @@ -667,32 +677,25 @@ svc_dg_store_pktinfo(struct msghdr *msg, SVCXPRT *xprt) if (msg->msg_flags & MSG_CTRUNC) return 0; - cmsg = CMSG_FIRSTHDR(msg); - if (cmsg == NULL || CMSG_NXTHDR(msg, cmsg) != NULL) - return 0; + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { - switch (((struct sockaddr *)msg->msg_name)->sa_family) { - case AF_INET: -#ifdef SOL_IP - if (svc_dg_store_in_pktinfo(cmsg, xprt)) - return 1; + if (cmsg->cmsg_level == IPPROTO_IP) { +#ifdef IP_PKTINFO + if (cmsg->cmsg_type == IP_PKTINFO) { + if (svc_dg_store_in_pktinfo(cmsg, xprt)) + return 1; + } #endif - break; - case AF_INET6: -#ifdef SOL_IP - /* Handle IPv4 PKTINFO as well on IPV6 interface */ - if (svc_dg_store_in_pktinfo(cmsg, xprt)) - return 1; +#ifdef IPV6_PKTINFO + if (cmsg->cmsg_type == IPV6_PKTINFO) { + if (svc_dg_store_in6_pktinfo(cmsg, xprt)) + return 1; + } #endif -#ifdef SOL_IPV6 - if 
(svc_dg_store_in6_pktinfo(cmsg, xprt)) - return 1; -#endif - break; - - default: - break; + } + } return 0; diff --git a/src/svc_internal.h b/src/svc_internal.h index c51e437fc8..01279521fc 100644 --- a/src/svc_internal.h +++ b/src/svc_internal.h @@ -98,10 +98,11 @@ union pktinfo_u { * Replaces old struct svc_dg_data by locally wrapping struct rpc_dplx_rec, * which wraps struct svc_xprt indexed by fd. */ +#define DG_NUM_PKTINFO 4 /* s/b enough space for all pktinfos in normal case*/ struct svc_dg_xprt { struct rpc_dplx_rec su_dr; /* SVCXPRT indexed by fd */ struct msghdr su_msghdr; /* msghdr received from clnt */ - unsigned char su_cmsg[SVC_CMSG_SIZE]; /* cmsghdr received from clnt */ + union pktinfo_u su_cmsg[DG_NUM_PKTINFO]; /* cmsghdr recv'd from clnt */ }; #define DG_DR(p) (opr_containerof((p), struct svc_dg_xprt, su_dr)) #define su_data(xprt) (DG_DR(REC_XPRT(xprt))) From c513a16c2634934cbe4c052a749b6820e726072a Mon Sep 17 00:00:00 2001 From: TweakySolution Date: Thu, 12 Dec 2019 10:06:16 -0600 Subject: [PATCH 56/70] Non-Blocking I/O results in hang. When testing asynchronous I/O, observed all the work threads eventually waiting on the same epoll event. This was because the svc worker was being rescheduled multiple times when EWOULDBLOCK occurs on send and we request evchan write (OUT) control on the fd_send. Only one svc_rqst_epoll_loop should be running per sr_rec. 
Signed-off-by: TweakySolution --- src/svc_rqst.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index a90fcf8581..ad6d10b8cd 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -978,7 +978,7 @@ svc_rqst_evchan_write(SVCXPRT *xprt, struct xdr_ioq *xioq, bool has_blocked) __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s: failed duplicating fd (%d)", __func__, code); - goto out; + return (code); } __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, @@ -1001,25 +1001,16 @@ svc_rqst_evchan_write(SVCXPRT *xprt, struct xdr_ioq *xioq, bool has_blocked) SVC_XPRT_FLAG_ADDED_SEND); } - if (code) { + if (unlikely(code)) { __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: failed hooking events (%d)", + "%s: failed evchan write control (%d)", __func__, code); - goto out; - } - - atomic_inc_int32_t(&sr_rec->ev_refcnt); - work_pool_submit(&svc_work_pool, &sr_rec->ev_wpe); - - __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, - "%s: create evchan write control fd pair (%d:%d)", - __func__, - sr_rec->sv[0], sr_rec->sv[1]); - -out: - - if (code != 0) { - svc_rqst_release(sr_rec); + } else { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: create evchan write control fd pair (%d:%d)", + __func__, + sr_rec->sv[0], sr_rec->sv[1]); + } } rpc_dplx_rui(rec); From 7590594961e7605703d55fa1aaa8f77742a49881 Mon Sep 17 00:00:00 2001 From: TweakySolution Date: Fri, 13 Dec 2019 08:21:13 -0600 Subject: [PATCH 57/70] fix erronous bracket --- src/svc_rqst.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index ad6d10b8cd..360f6b9350 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -1010,7 +1010,6 @@ svc_rqst_evchan_write(SVCXPRT *xprt, struct xdr_ioq *xioq, bool has_blocked) "%s: create evchan write control fd pair (%d:%d)", __func__, sr_rec->sv[0], sr_rec->sv[1]); - } } rpc_dplx_rui(rec); From 2a14f3d6579c911ba0d09dcefa755f295463ee2f Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Fri, 13 Dec 2019 14:57:41 -0500 Subject: [PATCH 58/70] 
3.1 Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6325ad7818..d8660d88e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ project(NTIRPC C) # version numbers set(NTIRPC_MAJOR_VERSION 3) # This is .0 for a release, .N for a stable branch, blank for development -set(NTIRPC_MINOR_VERSION .0) +set(NTIRPC_MINOR_VERSION .1) # -something for dev releases set(NTIRPC_VERSION_EXTRA ) set(VERSION_COMMENT From 1ff6036667afae1e8c87b7cae4aa2cc193372527 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Mon, 16 Dec 2019 11:14:51 -0500 Subject: [PATCH 59/70] 3.2 Signed-off-by: Daniel Gryniewicz --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d8660d88e6..e340e6625f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ project(NTIRPC C) # version numbers set(NTIRPC_MAJOR_VERSION 3) # This is .0 for a release, .N for a stable branch, blank for development -set(NTIRPC_MINOR_VERSION .1) +set(NTIRPC_MINOR_VERSION .2) # -something for dev releases set(NTIRPC_VERSION_EXTRA ) set(VERSION_COMMENT From eed2cc7999d3f5ad1c1e512706622fde1c05713e Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Thu, 20 Feb 2020 11:52:05 -0500 Subject: [PATCH 60/70] XPRT - Unregister xprt before dropping sentinal ref XPRTs are generally stored in some form of lookup table. When we're destroying the XPRT, we need to remove it from it's lookup table before we drop the sentinal ref. Otherwise, another thread can find it, and increment it's ref from 0 -> 1. This is very hard to handle in an atomic fashion. Add an unlink() API method, and implement it for all backends. This allows us to remove the XPRT from it's lookup table before we drop it's sentinal ref. 
Signed-off-by: Daniel Gryniewicz --- ntirpc/rpc/svc.h | 23 ++++++++++++++--------- src/rpc_rdma.c | 7 +++++++ src/svc_dg.c | 24 ++++++++++++------------ src/svc_raw.c | 6 ++++++ src/svc_vc.c | 18 +++++++++--------- 5 files changed, 48 insertions(+), 30 deletions(-) diff --git a/ntirpc/rpc/svc.h b/ntirpc/rpc/svc.h index 0d8c210c42..51df032735 100644 --- a/ntirpc/rpc/svc.h +++ b/ntirpc/rpc/svc.h @@ -204,33 +204,35 @@ struct svc_req; /* forward decl. */ typedef enum xprt_stat (*svc_req_fun_t) (struct svc_req *); -/* +/** * Server side transport handle */ struct svc_xprt { struct xp_ops { - /* receive incoming requests */ + /** receive incoming requests */ svc_xprt_fun_t xp_recv; - /* get transport status */ + /** get transport status */ svc_xprt_fun_t xp_stat; - /* decode incoming message header (called by request_cb) */ + /** decode incoming message header (called by request_cb) */ svc_req_fun_t xp_decode; - /* send reply */ + /** send reply */ svc_req_fun_t xp_reply; - /* optional checksum (after authentication/decryption) */ + /** optional checksum (after authentication/decryption) */ void (*xp_checksum) (struct svc_req *, void *, size_t); - /* actually destroy after xp_destroy_it and xp_release_it */ + /** Unlink xprt from it's lookup table. 
*/ + void (*xp_unlink) (SVCXPRT *, u_int, const char *, const int); + /** actually destroy after xp_destroy_it and xp_release_it */ void (*xp_destroy) (SVCXPRT *, u_int, const char *, const int); - /* catch-all function */ + /** catch-all function */ bool (*xp_control) (SVCXPRT *, const u_int, void *); - /* free client user data */ + /** free client user data */ svc_xprt_fun_t xp_free_user_data; } *xp_ops; @@ -476,6 +478,9 @@ static inline void svc_destroy_it(SVCXPRT *xprt, return; } + /* unlink before dropping last ref */ + (*(xprt)->xp_ops->xp_unlink)(xprt, flags, tag, line); + svc_release_it(xprt, SVC_RELEASE_FLAG_NONE, tag, line); } #define SVC_DESTROY(xprt) \ diff --git a/src/rpc_rdma.c b/src/rpc_rdma.c index 3e9f458eb3..b97a5f6261 100644 --- a/src/rpc_rdma.c +++ b/src/rpc_rdma.c @@ -2206,6 +2206,12 @@ rpc_rdma_connect(RDMAXPRT *xprt) rpc_rdma_state.cm_epollfd); } +static void +rpc_rdma_unlink_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) +{ + return; +} + static void rpc_rdma_destroy_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) { @@ -2245,6 +2251,7 @@ static struct xp_ops rpc_rdma_ops = { .xp_decode = (svc_req_fun_t)abort, .xp_reply = (svc_req_fun_t)abort, .xp_checksum = NULL, /* not used */ + .xp_unlink = rpc_rdma_unlink_it, .xp_destroy = rpc_rdma_destroy_it, .xp_control = rpc_rdma_control, .xp_free_user_data = NULL, /* no default */ diff --git a/src/svc_dg.c b/src/svc_dg.c index 37c56937cc..6ab1f9f489 100644 --- a/src/svc_dg.c +++ b/src/svc_dg.c @@ -490,6 +490,15 @@ svc_dg_destroy_task(struct work_pool_entry *wpe) svc_dg_xprt_free(DG_DR(rec)); } +static void +svc_dg_unlink_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) +{ + if (!xprt->xp_parent) { + /* only original parent is registered */ + svc_rqst_xprt_unregister(xprt, flags); + } +} + static void svc_dg_destroy_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) { @@ -498,11 +507,6 @@ svc_dg_destroy_it(SVCXPRT *xprt, u_int flags, const 
char *tag, const int line) .tv_nsec = 0, }; - if (!xprt->xp_parent) { - /* only original parent is registered */ - svc_rqst_xprt_unregister(xprt, flags); - } - __warnx(TIRPC_DEBUG_FLAG_REFCNT, "%s() %p fd %d xp_refcnt %" PRId32 " @%s:%d", __func__, xprt, xprt->xp_fd, xprt->xp_refcnt, tag, line); @@ -517,12 +521,6 @@ svc_dg_destroy_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) work_pool_submit(&svc_work_pool, &(REC_XPRT(xprt)->ioq.ioq_wpe)); } -static void -svc_dg_destroy(SVCXPRT *xprt, u_int flags, const char *tag, const int line) -{ - svc_dg_destroy_it(xprt, flags, tag, line); -} - extern mutex_t ops_lock; /*ARGSUSED*/ @@ -569,7 +567,8 @@ svc_dg_override_ops(SVCXPRT *xprt, SVCXPRT *rendezvous) ops.xp_decode = svc_dg_decode; ops.xp_reply = svc_dg_reply; ops.xp_checksum = svc_dg_checksum; - ops.xp_destroy = svc_dg_destroy; + ops.xp_unlink = svc_dg_unlink_it; + ops.xp_destroy = svc_dg_destroy_it; ops.xp_control = svc_dg_control; ops.xp_free_user_data = NULL; /* no default */ } @@ -593,6 +592,7 @@ svc_dg_rendezvous_ops(SVCXPRT *xprt) ops.xp_decode = (svc_req_fun_t)abort; ops.xp_reply = (svc_req_fun_t)abort; ops.xp_checksum = NULL; /* not used */ + ops.xp_unlink = svc_dg_unlink_it; ops.xp_destroy = svc_dg_destroy_it; ops.xp_control = svc_dg_control; ops.xp_free_user_data = NULL; /* no default */ diff --git a/src/svc_raw.c b/src/svc_raw.c index 8519ba5e99..992b825729 100644 --- a/src/svc_raw.c +++ b/src/svc_raw.c @@ -185,6 +185,11 @@ svc_raw_reply(struct svc_req *req) } /*ARGSUSED*/ +static void +svc_raw_unlink(SVCXPRT *xprt, u_int flags, const char *tag, const int line) +{ +} + static void svc_raw_destroy(SVCXPRT *xprt, u_int flags, const char *tag, const int line) { @@ -212,6 +217,7 @@ svc_raw_ops(SVCXPRT *xprt) ops.xp_decode = svc_raw_decode; ops.xp_reply = svc_raw_reply; ops.xp_checksum = NULL; /* optional */ + ops.xp_unlink = svc_raw_unlink; ops.xp_destroy = svc_raw_destroy; ops.xp_control = svc_raw_control; ops.xp_free_user_data = NULL; /* no 
default */ diff --git a/src/svc_vc.c b/src/svc_vc.c index 38cc99f1bd..4a1583c4f5 100644 --- a/src/svc_vc.c +++ b/src/svc_vc.c @@ -562,6 +562,12 @@ svc_vc_destroy_task(struct work_pool_entry *wpe) svc_vc_xprt_free(VC_DR(rec)); } +static void +svc_vc_unlink_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) +{ + svc_rqst_xprt_unregister(xprt, flags); +} + static void svc_vc_destroy_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) { @@ -570,8 +576,6 @@ svc_vc_destroy_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) .tv_nsec = 0, }; - svc_rqst_xprt_unregister(xprt, flags); - __warnx(TIRPC_DEBUG_FLAG_REFCNT, "%s() %p fd %d xp_refcnt %" PRId32 " @%s:%d", __func__, xprt, xprt->xp_fd, xprt->xp_refcnt, tag, line); @@ -586,12 +590,6 @@ svc_vc_destroy_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) work_pool_submit(&svc_work_pool, &(REC_XPRT(xprt)->ioq.ioq_wpe)); } -static void -svc_vc_destroy(SVCXPRT *xprt, u_int flags, const char *tag, const int line) -{ - svc_vc_destroy_it(xprt, flags, tag, line); -} - extern mutex_t ops_lock; /*ARGSUSED*/ @@ -981,7 +979,8 @@ svc_vc_override_ops(SVCXPRT *xprt, SVCXPRT *rendezvous) ops.xp_decode = svc_vc_decode; ops.xp_reply = svc_vc_reply; ops.xp_checksum = svc_vc_checksum; - ops.xp_destroy = svc_vc_destroy; + ops.xp_unlink = svc_vc_unlink_it; + ops.xp_destroy = svc_vc_destroy_it; ops.xp_control = svc_vc_control; ops.xp_free_user_data = NULL; /* no default */ } @@ -1006,6 +1005,7 @@ svc_vc_rendezvous_ops(SVCXPRT *xprt) ops.xp_decode = (svc_req_fun_t)abort; ops.xp_reply = (svc_req_fun_t)abort; ops.xp_checksum = NULL; /* not used */ + ops.xp_unlink = svc_vc_unlink_it; ops.xp_destroy = svc_vc_destroy_it; ops.xp_control = svc_vc_rendezvous_control; ops.xp_free_user_data = NULL; /* no default */ From c7239b57d6e3e84d6bf59f10d5bcad2d28d6f1a2 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Fri, 28 Feb 2020 14:52:09 -0500 Subject: [PATCH 61/70] XPRT - Simplify refcounting There is a 
race when unhooking events from epoll, where the event could be ready for delivery (or even delivered, but the thread not scheduled) and so the event is processed after the unhook, and therefore after the XPRT has been freed. To close this, stop putting a pointer to the rec in the event data, and instead put the FD in there and use it to look up the XPRT. This ensures that, if we got the XPRT from lookup, it's valid and ref'd for the duration of the event. Once we're no longer storing a XPRT pointer in the epoll event, we don't need a refcount across the hook/event/unhook series. Remove these refcounts, allowing a destroyed XPRT to just be freed. Signed-off-by: Daniel Gryniewicz --- src/svc_dg.c | 3 ++ src/svc_internal.h | 1 + src/svc_rqst.c | 84 ++++++++++++++++++++++++++-------------------- src/svc_vc.c | 5 +++ src/svc_xprt.c | 7 ++-- 5 files changed, 60 insertions(+), 40 deletions(-) diff --git a/src/svc_dg.c b/src/svc_dg.c index 6ab1f9f489..ea969fe604 100644 --- a/src/svc_dg.c +++ b/src/svc_dg.c @@ -496,6 +496,9 @@ svc_dg_unlink_it(SVCXPRT *xprt, u_int flags, const char *tag, const int line) if (!xprt->xp_parent) { /* only original parent is registered */ svc_rqst_xprt_unregister(xprt, flags); + } else { + /* Still need to unhook it */ + svc_rqst_unhook(xprt); } } diff --git a/src/svc_internal.h b/src/svc_internal.h index 01279521fc..682e291bf7 100644 --- a/src/svc_internal.h +++ b/src/svc_internal.h @@ -173,5 +173,6 @@ int svc_rqst_xprt_register(SVCXPRT *, SVCXPRT *); void svc_rqst_xprt_unregister(SVCXPRT *, uint32_t); int svc_rqst_evchan_write(SVCXPRT *, struct xdr_ioq *, bool); void svc_rqst_xprt_send_complete(SVCXPRT *); +void svc_rqst_unhook(SVCXPRT *); #endif /* TIRPC_SVC_INTERNAL_H */ diff --git a/src/svc_rqst.c b/src/svc_rqst.c index e2662d849f..e6016683b9 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -665,9 +665,8 @@ svc_rqst_rearm_events_locked(SVCXPRT *xprt, uint16_t ev_flags) if (sr_rec->ev_flags & SVC_RQST_FLAG_SHUTDOWN) return (0); - /* 
Currently, can only be called with one of ADDED_RECV or ADDED_SEND, so we - * only need to take one ref. */ - SVC_REF(xprt, SVC_REF_FLAG_NONE); + /* Don't take a ref on the xprt. We take a ref in hook, and release it + * in unhook. */ /* assuming success */ atomic_set_uint16_t_bits(&xprt->xp_flags, ev_flags); @@ -736,10 +735,10 @@ svc_rqst_rearm_events_locked(SVCXPRT *xprt, uint16_t ev_flags) &xprt->xp_flags, SVC_XPRT_FLAG_ADDED_SEND); __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: %p fd %d xp_refcnt %" PRId32 + "%s: %p fd_send %d xp_refcnt %" PRId32 " sr_rec %p evchan %d ev_refcnt %" PRId32 " epoll_fd %d control fd pair (%d:%d) rearm failed (%d)", - __func__, rec, rec->xprt.xp_fd, + __func__, rec, rec->xprt.xp_fd_send, rec->xprt.xp_refcnt, sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, sr_rec->ev_u.epoll.epoll_fd, @@ -748,10 +747,10 @@ svc_rqst_rearm_events_locked(SVCXPRT *xprt, uint16_t ev_flags) } else { __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | TIRPC_DEBUG_FLAG_REFCNT, - "%s: %p fd %d xp_refcnt %" PRId32 + "%s: %p fd_send %d xp_refcnt %" PRId32 " sr_rec %p evchan %d ev_refcnt %"PRId32 " epoll_fd %d control fd pair (%d:%d) rearm event %p", - __func__, rec, rec->xprt.xp_fd, + __func__, rec, rec->xprt.xp_fd_send, rec->xprt.xp_refcnt, sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, sr_rec->ev_u.epoll.epoll_fd, @@ -806,11 +805,16 @@ svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec, { struct epoll_event *ev; + /* For epoll, we no longer need a ref on the xprt. epoll uses + * the FD as a key now, and the xprt is looked up, which gets a + * ref for the event. The xprt can therefore be freed while in + * epoll, with no consequences. 
*/ + if (ev_flags & SVC_XPRT_FLAG_ADDED_RECV) { ev = &rec->ev_u.epoll.event_recv; /* set up epoll user data */ - ev->data.ptr = rec; + ev->data.fd = rec->xprt.xp_fd; /* wait for read events, level triggered, oneshot */ ev->events = EPOLLONESHOT | EPOLLIN; @@ -849,8 +853,8 @@ svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec, if (ev_flags & SVC_XPRT_FLAG_ADDED_SEND) { ev = &rec->ev_u.epoll.event_send; - /* set up epoll user data */ - ev->data.ptr = rec; + /* set up epoll user data. Lookup needs the primary FD */ + ev->data.fd = rec->xprt.xp_fd; /* wait for write events, edge triggered, oneshot */ ev->events = EPOLLONESHOT | EPOLLOUT | EPOLLET; @@ -865,10 +869,10 @@ svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec, &rec->xprt.xp_flags, SVC_XPRT_FLAG_ADDED_SEND); __warnx(TIRPC_DEBUG_FLAG_ERROR, - "%s: %p fd %d xp_refcnt %" PRId32 + "%s: %p fd_send %d xp_refcnt %" PRId32 " sr_rec %p evchan %d ev_refcnt %" PRId32 " epoll_fd %d control fd pair (%d:%d) direction out hook failed (%d)", - __func__, rec, rec->xprt.xp_fd, + __func__, rec, rec->xprt.xp_fd_send, rec->xprt.xp_refcnt, sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, sr_rec->ev_u.epoll.epoll_fd, @@ -876,10 +880,10 @@ svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec, } else { __warnx(TIRPC_DEBUG_FLAG_SVC_RQST | TIRPC_DEBUG_FLAG_REFCNT, - "%s: %p fd %d xp_refcnt %" PRId32 + "%s: %p fd_send %d xp_refcnt %" PRId32 " sr_rec %p evchan %d ev_refcnt %" PRId32 " epoll_fd %d control fd pair (%d:%d) direction out hook event %p", - __func__, rec, rec->xprt.xp_fd, + __func__, rec, rec->xprt.xp_fd_send, rec->xprt.xp_refcnt, sr_rec, sr_rec->id_k, sr_rec->ev_refcnt, sr_rec->ev_u.epoll.epoll_fd, @@ -904,20 +908,28 @@ svc_rqst_hook_events(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec, return (code); } -/* - * RPC_DPLX_LOCKED - */ -static void -svc_rqst_unreg(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) +void +svc_rqst_unhook(SVCXPRT *xprt) { + 
struct rpc_dplx_rec *rec = REC_XPRT(xprt); + struct svc_rqst_rec *sr_rec = (struct svc_rqst_rec *)rec->ev_p; uint16_t xp_flags = - atomic_postclear_uint16_t_bits(&rec->xprt.xp_flags, + atomic_postclear_uint16_t_bits(&xprt->xp_flags, SVC_XPRT_FLAG_ADDED_RECV | SVC_XPRT_FLAG_ADDED_SEND); /* clear events */ if (xp_flags & (SVC_XPRT_FLAG_ADDED_RECV | SVC_XPRT_FLAG_ADDED_SEND)) (void)svc_rqst_unhook_events(rec, sr_rec, xp_flags); +} + +/* + * RPC_DPLX_LOCKED + */ +static void +svc_rqst_unreg(struct rpc_dplx_rec *rec, struct svc_rqst_rec *sr_rec) +{ + svc_rqst_unhook(&rec->xprt); /* Unlinking after debug message ensures both the xprt and the sr_rec * are still present, as the xprt unregisters before release. @@ -982,7 +994,7 @@ svc_rqst_evchan_write(SVCXPRT *xprt, struct xdr_ioq *xioq, bool has_blocked) } __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, - "%s: xp_fd_send fd %d dup of xp_fd %d", + "%s: xp_fd_send %d dup of xp_fd %d", __func__, xprt->xp_fd_send, xprt->xp_fd); } } @@ -995,8 +1007,6 @@ svc_rqst_evchan_write(SVCXPRT *xprt, struct xdr_ioq *xioq, bool has_blocked) code = svc_rqst_rearm_events_locked(xprt, SVC_XPRT_FLAG_ADDED_SEND); } else { - /* svc_rqst_hook_events doesn't take a ref, so take one here */ - SVC_REF(xprt, SVC_REF_FLAG_NONE); code = svc_rqst_hook_events(rec, sr_rec, SVC_XPRT_FLAG_ADDED_SEND); } @@ -1156,7 +1166,6 @@ svc_rqst_xprt_task_recv(struct work_pool_entry *wpe) struct xdr_ioq *ioq = opr_containerof(wpe, struct xdr_ioq, ioq_wpe); struct rpc_dplx_rec *rec = ioq->rec; - enum xprt_stat stat = XPRT_IDLE; atomic_clear_uint16_t_bits(&ioq->ioq_s.qflags, IOQ_FLAG_WORKING); @@ -1174,13 +1183,11 @@ svc_rqst_xprt_task_recv(struct work_pool_entry *wpe) * xp_refcnt need more than 1 (this task). */ (void)clock_gettime(CLOCK_MONOTONIC_FAST, &rec->recv.ts); - stat = SVC_RECV(&rec->xprt); + (void)SVC_RECV(&rec->xprt); } - if (stat != XPRT_SUSPEND) { - /* If tests fail, log non-fatal "WARNING! already destroying!" 
*/ - SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); - } + /* Release the ref taken on the event */ + SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); } enum xprt_stat svc_request(SVCXPRT *xprt, XDR *xdrs) @@ -1274,8 +1281,6 @@ svc_rqst_xprt_task_send(struct work_pool_entry *wpe) svc_ioq_write(&rec->xprt); } - /* If tests fail, log non-fatal "WARNING! already destroying!" - */ SVC_RELEASE(&rec->xprt, SVC_RELEASE_FLAG_NONE); } @@ -1351,7 +1356,8 @@ svc_rqst_clean_idle(int timeout) static struct xdr_ioq * svc_rqst_epoll_event(struct svc_rqst_rec *sr_rec, struct epoll_event *ev) { - struct rpc_dplx_rec *rec = (struct rpc_dplx_rec *) ev->data.ptr; + SVCXPRT *xprt; + struct rpc_dplx_rec *rec; uint16_t xp_flags, ev_flag = 0; struct xdr_ioq *ioq = NULL; work_pool_fun_t fun; @@ -1371,11 +1377,15 @@ svc_rqst_epoll_event(struct svc_rqst_rec *sr_rec, struct epoll_event *ev) return (NULL); } - /* Another task may release transport in parallel. - * We have a ref from being in epoll, but since epoll is one-shot, a new ref - * will be taken when we re-enter epoll. Use this ref for the processor - * without taking another one. - */ + xprt = svc_xprt_lookup(ev->data.fd, NULL); + if (!xprt) { + __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, + "%s: fd %d no associated xprt", + __func__, ev->data.fd); + return (NULL); + } + /* At this point, we have a ref on the xprt, and know it's valid */ + rec = REC_XPRT(xprt); __warnx(TIRPC_DEBUG_FLAG_SVC_RQST, "%s: event %p %08x%s%s rpc_dplx_rec %p (sr_rec %p)", diff --git a/src/svc_vc.c b/src/svc_vc.c index 4a1583c4f5..bec4b6e389 100644 --- a/src/svc_vc.c +++ b/src/svc_vc.c @@ -515,8 +515,13 @@ svc_vc_rendezvous(SVCXPRT *xprt) SVC_DESTROY(newxprt); /* Was never added to epoll */ SVC_RELEASE(newxprt, SVC_RELEASE_FLAG_NONE); + SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); return (XPRT_DESTROYED); } + + /* We're not using a ref for the hook anymore, since epoll doesn't store + * the transport pointer. Drop the extra ref here. 
*/ + SVC_RELEASE(newxprt, SVC_RELEASE_FLAG_NONE); return (XPRT_IDLE); } diff --git a/src/svc_xprt.c b/src/svc_xprt.c index d0bc832121..8581d10eed 100644 --- a/src/svc_xprt.c +++ b/src/svc_xprt.c @@ -209,13 +209,14 @@ svc_xprt_lookup(int fd, svc_xprt_setup_t setup) rpc_dplx_rli(rec); xp_flags = atomic_clear_uint16_t_bits(&xprt->xp_flags, SVC_XPRT_FLAG_INITIAL); + rpc_dplx_rui(rec); + if (!(xp_flags & SVC_XPRT_FLAG_DESTROYED)) { /* do not return destroyed xprts */ return (xprt); } /* unlock before release permits releasing here after destroy */ - rpc_dplx_rui(rec); SVC_RELEASE(xprt, SVC_RELEASE_FLAG_NONE); return (NULL); } @@ -391,9 +392,9 @@ void svc_xprt_trace(SVCXPRT *xprt, const char *func, const char *tag, const int line) { __warnx(TIRPC_DEBUG_FLAG_REFCNT, - "%s() %p fd %d xp_refcnt %" PRId32 + "%s() %p fd %d fd_send %d xp_refcnt %" PRId32 " af %u port %u @%s:%d", - func, xprt, xprt->xp_fd, xprt->xp_refcnt, + func, xprt, xprt->xp_fd, xprt->xp_fd_send, xprt->xp_refcnt, xprt->xp_remote.ss.ss_family, __rpc_address_port(&xprt->xp_remote), tag, line); From 3ce0a91abf39b7ba13aa904a52b4757bdc077b31 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Wed, 1 Apr 2020 09:49:44 +0200 Subject: [PATCH 62/70] authgss_prot: silence false-positive clang warning clang complains about uninitialized use of xv/gv_count variables because there is no final else clause, even if we do check that svc is either of these types at the start of the function. 
Silence false positive by skipping the last "always-true" check --- src/authgss_prot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/authgss_prot.c b/src/authgss_prot.c index b01578f452..23f2755284 100644 --- a/src/authgss_prot.c +++ b/src/authgss_prot.c @@ -305,7 +305,7 @@ xdr_rpc_gss_wrap(XDR *xdrs, xdrproc_t xdr_func, void *xdr_ptr, xv_count = data_count + 2; gv_count = data_count + 1; after_data = data_count; - } else if (svc == RPCSEC_GSS_SVC_PRIVACY) { + } else /* svc == RPCSEC_GSS_SVC_PRIVACY */ { /* Add header, padding, and trailer for the wrap */ xv_count = data_count + 3; gv_count = data_count + 3; From 2d13724606d6391c2cc485d2dbd0555cc6c1bcae Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Mon, 6 Apr 2020 14:30:20 -0400 Subject: [PATCH 63/70] VC - RELEASE after DESTROY Many error paths call DESTROY, which will unlink and drop the ref. This means that the final RELEASE will free, causing the DESTROY to use-after-free. Instead, make sure we DESTROY first. Signed-off-by: Daniel Gryniewicz --- src/clnt_vc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/clnt_vc.c b/src/clnt_vc.c index e680b5532b..57d48af71e 100644 --- a/src/clnt_vc.c +++ b/src/clnt_vc.c @@ -459,12 +459,14 @@ clnt_vc_destroy(CLIENT *clnt) struct cx_data *cx = CX_DATA(clnt); if (cx->cx_rec) { - SVC_RELEASE(&cx->cx_rec->xprt, SVC_RELEASE_FLAG_NONE); - if (clnt->cl_flags & CLNT_FLAG_LOCAL) { /* Local client; destroy the xprt */ SVC_DESTROY(&cx->cx_rec->xprt); } + + /* RELEASE after DESTROY in case an error case has already + * called DESTROY */ + SVC_RELEASE(&cx->cx_rec->xprt, SVC_RELEASE_FLAG_NONE); } clnt_vc_data_free(CT_DATA(cx)); } From d4f1c2d3b4e23f58c143998dd24defc63213a776 Mon Sep 17 00:00:00 2001 From: Daniel Gryniewicz Date: Tue, 14 Apr 2020 10:04:53 -0400 Subject: [PATCH 64/70] Avoid race checking for GSS init The check for initialization in GSS was not locked, so there could be a race where two threads both initialize. 
Fix this by moving the check inside the lock. Signed-off-by: Daniel Gryniewicz --- src/authgss_hash.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/authgss_hash.c b/src/authgss_hash.c index 8bb302006c..1d3ffedccd 100644 --- a/src/authgss_hash.c +++ b/src/authgss_hash.c @@ -103,6 +103,11 @@ authgss_hash_init() mutex_lock(&authgss_hash_st.lock); + if (authgss_hash_st.initialized) { + mutex_unlock(&authgss_hash_st.lock); + return; + } + code = rbtx_init(&authgss_hash_st.xt, svc_rpc_gss_cmpf, __svc_params->gss.ctx_hash_partitions, @@ -133,13 +138,6 @@ authgss_hash_init() mutex_unlock(&authgss_hash_st.lock); } -#define cond_init_authgss_hash() { \ - do { \ - if (!authgss_hash_st.initialized) \ - authgss_hash_init(); \ - } while (0); \ - } - struct svc_rpc_gss_data * authgss_ctx_hash_get(struct rpc_gss_cred *gc) { @@ -149,7 +147,7 @@ authgss_ctx_hash_get(struct rpc_gss_cred *gc) struct authgss_x_part *axp; struct rbtree_x_part *t; - cond_init_authgss_hash(); + authgss_hash_init(); gss_ctx = (gss_union_ctx_id_desc *) (gc->gc_ctx.value); gk.hk.k = gss_ctx_hash(gss_ctx); @@ -181,7 +179,7 @@ authgss_ctx_hash_set(struct svc_rpc_gss_data *gd) gss_union_ctx_id_desc *gss_ctx; bool rslt; - cond_init_authgss_hash(); + authgss_hash_init(); gss_ctx = (gss_union_ctx_id_desc *) (gd->ctx); gd->hk.k = gss_ctx_hash(gss_ctx); @@ -210,7 +208,7 @@ authgss_ctx_hash_del(struct svc_rpc_gss_data *gd) struct rbtree_x_part *t; struct authgss_x_part *axp; - cond_init_authgss_hash(); + authgss_hash_init(); t = rbtx_partition_of_scalar(&authgss_hash_st.xt, gd->hk.k); mutex_lock(&t->mtx); @@ -265,7 +263,7 @@ void authgss_ctx_gc_idle(void) struct svc_rpc_gss_data *gd; int ix, cnt, part; - cond_init_authgss_hash(); + authgss_hash_init(); for (ix = 0, cnt = 0, part = IDLE_NEXT(); ((ix < authgss_hash_st.xt.npart) && From 094c355b2ac45fdb3fe919b561035156b6735ae9 Mon Sep 17 00:00:00 2001 From: Madhu Thorat Date: Sun, 19 Apr 2020 16:28:17 -0400 Subject: [PATCH 
65/70] In clnt_tli_ncreate() close FD in case client creation failed In clnt_tli_ncreate() if client creation fails then we don't close FD and cause FD leak. Fixed by closing FD in case of client creation failure. Signed-off-by: Madhu Thorat --- src/clnt_generic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/clnt_generic.c b/src/clnt_generic.c index 4e43f7a9b1..b4f1eb53d1 100644 --- a/src/clnt_generic.c +++ b/src/clnt_generic.c @@ -401,6 +401,9 @@ clnt_tli_ncreate(int fd, const struct netconfig *nconf, goto err; } + if (CLNT_FAILURE(cl)) + goto err1; + if (flags & CLNT_CREATE_FLAG_CLOSE) { /* We got a new FD; this makes it a local client */ cl->cl_flags |= CLNT_FLAG_LOCAL; From c2f37f70695a840f6db0a2e13d64cdb4de4f043d Mon Sep 17 00:00:00 2001 From: Gaurav Gangalwar Date: Wed, 13 May 2020 04:54:31 -0400 Subject: [PATCH 66/70] work_pool: Fix for efficient scheduling. Separate out work queue and worker queue. Always submit work to the work queue and wakeup available work thread. Each work thread will always check for entry in work queue, so that it can pickup next work without scheduling. Reduce the number of idle thread count, less number of threads helps in less scheduling overhead. 
--- ntirpc/rpc/work_pool.h | 1 + src/svc.c | 2 +- src/work_pool.c | 82 +++++++++++++++++++++++------------------- 3 files changed, 48 insertions(+), 37 deletions(-) diff --git a/ntirpc/rpc/work_pool.h b/ntirpc/rpc/work_pool.h index 8ec4751dcd..7cac26967f 100644 --- a/ntirpc/rpc/work_pool.h +++ b/ntirpc/rpc/work_pool.h @@ -72,6 +72,7 @@ struct work_pool_thread { char worker_name[16]; pthread_t pt; uint32_t worker_index; + bool wakeup; }; typedef void (*work_pool_fun_t) (struct work_pool_entry *); diff --git a/src/svc.c b/src/svc.c index faf3dfec40..c441b506aa 100644 --- a/src/svc.c +++ b/src/svc.c @@ -184,7 +184,7 @@ svc_init(svc_init_params *params) if (__svc_params->ioq.thrd_max < params->ioq_thrd_max) __svc_params->ioq.thrd_max = params->ioq_thrd_max; - work_pool_params.thrd_min = __svc_params->ioq.thrd_min + channels; + work_pool_params.thrd_min = __svc_params->ioq.thrd_min; work_pool_params.thrd_max = __svc_params->ioq.thrd_max; if (work_pool_params.thrd_max < work_pool_params.thrd_min) work_pool_params.thrd_max = work_pool_params.thrd_min; diff --git a/src/work_pool.c b/src/work_pool.c index 7a1b53967d..5d69f55715 100644 --- a/src/work_pool.c +++ b/src/work_pool.c @@ -154,7 +154,6 @@ work_pool_thread(void *arg) pthread_cond_init(&wpt->pqcond, NULL); pthread_mutex_lock(&pool->pqh.qmutex); - TAILQ_INSERT_TAIL(&pool->wptqh, wpt, wptq); wpt->worker_index = atomic_inc_uint32_t(&pool->worker_index); snprintf(wpt->worker_name, sizeof(wpt->worker_name), "%.5s%" PRIu32, @@ -185,21 +184,21 @@ work_pool_thread(void *arg) wpt->work = NULL; pthread_mutex_lock(&pool->pqh.qmutex); } - - if (0 > pool->pqh.qcount++) { - /* negative for task(s) */ - have = TAILQ_FIRST(&pool->pqh.qh); + /* + * Check for any queued work to avoid scheduling. 
+ */ + have = TAILQ_FIRST(&pool->pqh.qh); + if (have) { TAILQ_REMOVE(&pool->pqh.qh, have, q); - wpt->work = (struct work_pool_entry *)have; continue; } - /* positive for waiting worker(s): - * use the otherwise empty pool to hold them, - * simplifying mutex and pointer setup. + /* + * Add myself to waiting queue. */ - TAILQ_INSERT_TAIL(&pool->pqh.qh, &wpt->pqe, q); + pool->pqh.qcount++; + TAILQ_INSERT_TAIL(&pool->wptqh, wpt, wptq); __warnx(TIRPC_DEBUG_FLAG_WORKER, "%s() %s waiting", @@ -208,32 +207,46 @@ work_pool_thread(void *arg) clock_gettime(CLOCK_REALTIME_FAST, &ts); timespec_addms(&ts, pool->timeout_ms); + wpt->wakeup = false; + /* Note: the mutex is the pool _head, * but the condition is per worker, * making the signal efficient! */ rc = pthread_cond_timedwait(&wpt->pqcond, &pool->pqh.qmutex, &ts); - if (!wpt->work) { - /* Allow for possible timing race: - * work entry can be submitted by another - * thread during the thread task switch - * after shutdown or timeout? - * Then, has already been removed there. - */ + + /* + * Wokeup after work submit. + * It could be shutdown also. + */ + if (!rc) { + if (wpt->wakeup) + continue; + } + + /* + * It could be timeout. + * There could be race if submit got lock and + * it will try to wakeup me. 
+ */ + if (!wpt->wakeup) { pool->pqh.qcount--; - TAILQ_REMOVE(&pool->pqh.qh, &wpt->pqe, q); + TAILQ_REMOVE(&pool->wptqh, wpt, wptq); + } else { + continue; } + if (rc && rc != ETIMEDOUT) { __warnx(TIRPC_DEBUG_FLAG_ERROR, "%s() cond_timedwait failed (%d)\n", __func__, rc); break; } - } while (wpt->work || pool->pqh.qcount < pool->params.thrd_min); + } while (wpt->work || wpt->wakeup || + pool->pqh.qcount < pool->params.thrd_min); pool->n_threads--; - TAILQ_REMOVE(&pool->wptqh, wpt, wptq); pthread_mutex_unlock(&pool->pqh.qmutex); __warnx(TIRPC_DEBUG_FLAG_WORKER, @@ -274,26 +287,23 @@ work_pool_submit(struct work_pool *pool, struct work_pool_entry *work) /* queue is draining */ return (0); } - pthread_mutex_lock(&pool->pqh.qmutex); - - if (0 < pool->pqh.qcount--) { - struct work_pool_thread *wpt = (struct work_pool_thread *) - TAILQ_FIRST(&pool->pqh.qh); - /* positive for waiting worker(s) */ - TAILQ_REMOVE(&pool->pqh.qh, &wpt->pqe, q); - wpt->work = work; - - /* Note: the mutex is the pool _head, - * but the condition is per worker, - * making the signal efficient! - */ + pthread_mutex_lock(&pool->pqh.qmutex); + /* + * Insert in work queue so that running thread can + * pickup without scheduling. + */ + TAILQ_INSERT_TAIL(&pool->pqh.qh, &work->pqe, q); + struct work_pool_thread *wpt = TAILQ_LAST(&pool->wptqh, work_pool_s); + if (wpt) { + pool->pqh.qcount--; + TAILQ_REMOVE(&pool->wptqh, wpt, wptq); + assert(!wpt->wakeup); + wpt->wakeup = true; pthread_cond_signal(&wpt->pqcond); } else { - /* negative for task(s) */ - TAILQ_INSERT_TAIL(&pool->pqh.qh, &work->pqe, q); + assert(pool->pqh.qcount == 0); } - pthread_mutex_unlock(&pool->pqh.qmutex); return rc; } From c8310d07a58c5c56ad024f34ef3874f7bc262bf3 Mon Sep 17 00:00:00 2001 From: Gaurav Gangalwar Date: Thu, 14 May 2020 15:05:50 -0400 Subject: [PATCH 67/70] work_pool: Make sure thread max > channels. 
--- src/svc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/svc.c b/src/svc.c index c441b506aa..b3db054acb 100644 --- a/src/svc.c +++ b/src/svc.c @@ -186,8 +186,13 @@ svc_init(svc_init_params *params) work_pool_params.thrd_min = __svc_params->ioq.thrd_min; work_pool_params.thrd_max = __svc_params->ioq.thrd_max; - if (work_pool_params.thrd_max < work_pool_params.thrd_min) - work_pool_params.thrd_max = work_pool_params.thrd_min; + /* + * thrd_max should > channels. + */ + if (work_pool_params.thrd_max < (work_pool_params.thrd_min + + channels)) + work_pool_params.thrd_max = work_pool_params.thrd_min + + channels; if (work_pool_init(&svc_work_pool, "svc_", &work_pool_params)) { mutex_unlock(&__svc_params->mtx); From f6f305838f672db65dbeac95c95bb889dbaf9b54 Mon Sep 17 00:00:00 2001 From: Gaurav Gangalwar Date: Tue, 2 Jun 2020 11:42:07 -0400 Subject: [PATCH 68/70] rqst: Fix hang in non blocking send. We are updating data->ptr while rearming fd_send, which is changing the data fd since it's a union, causing IO to hang. 
Signed-off-by: Gaurav Gangalwar --- src/svc_rqst.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/svc_rqst.c b/src/svc_rqst.c index e6016683b9..1264e22330 100644 --- a/src/svc_rqst.c +++ b/src/svc_rqst.c @@ -718,9 +718,6 @@ svc_rqst_rearm_events_locked(SVCXPRT *xprt, uint16_t ev_flags) if (ev_flags & SVC_XPRT_FLAG_ADDED_SEND) { ev = &rec->ev_u.epoll.event_send; - /* set up epoll user data */ - ev->data.ptr = rec; - /* wait for write events, edge triggered, oneshot */ ev->events = EPOLLONESHOT | EPOLLOUT | EPOLLET; From 06dac58c1f24eaf171dd381eccfa620106920657 Mon Sep 17 00:00:00 2001 From: Ashish Sangwan Date: Tue, 30 Jun 2020 07:15:23 -0700 Subject: [PATCH 69/70] Fix mem_leak in xdr_rpc_gss_buf We need to free gss_buffer --- src/authgss_prot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/authgss_prot.c b/src/authgss_prot.c index 23f2755284..d6bb450d92 100644 --- a/src/authgss_prot.c +++ b/src/authgss_prot.c @@ -101,6 +101,7 @@ xdr_rpc_gss_buf(XDR *xdrs, gss_buffer_t buf, u_int maxsize) case XDR_DECODE: return (xdr_rpc_gss_decode(xdrs, buf)); case XDR_FREE: + mem_free(buf->value, buf->length); return (TRUE); }; return (FALSE); From 031cebcc53d68723078a41bd425e07a7b2c0ce86 Mon Sep 17 00:00:00 2001 From: Malahal Naineni Date: Thu, 27 Aug 2020 17:43:39 +0530 Subject: [PATCH 70/70] Prevent set_threadgroups() failure Neither uid nor gid should be -1. If a client sends such a value, setfsuid, setfsgid or __NR_setgroups may fail. Check for invalid uid/gid and return a failure in such a case. 
Signed-off-by: Malahal Naineni --- src/svc_auth_unix.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/svc_auth_unix.c b/src/svc_auth_unix.c index 34f4f0e2a8..5390640379 100644 --- a/src/svc_auth_unix.c +++ b/src/svc_auth_unix.c @@ -93,7 +93,8 @@ _svcauth_unix(struct svc_req *req) aup->aup_uid = (int)IXDR_GET_INT32(buf); aup->aup_gid = (int)IXDR_GET_INT32(buf); gid_len = (size_t) IXDR_GET_U_INT32(buf); - if (gid_len > NGRPS) { + if (gid_len > NGRPS || aup->aup_uid == (uid_t)-1 || + aup->aup_gid == (gid_t)-1) { stat = AUTH_BADCRED; goto done; } @@ -101,6 +102,10 @@ _svcauth_unix(struct svc_req *req) for (i = 0; i < gid_len; i++) { /* suppress block warning */ aup->aup_gids[i] = (int)IXDR_GET_INT32(buf); + if (aup->aup_gids[i] == (gid_t)-1) { + stat = AUTH_BADCRED; + goto done; + } } /* * five is the smallest unix credentials structure -