aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-12-19 11:39:50 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2020-12-19 11:39:50 -0800
commit1db98bcf56b1126428d59e71b815c310f73e44ab (patch)
tree086bf1626c161872805a6ba10b1f944646bdb4db /fs
parent3644e2d2dda78e21edd8f5415b6d7ab03f5f54f3 (diff)
parent01ab1ede9190b9fafa88b1c97129c034fa9aa437 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge still more updates from Andrew Morton: "18 patches. Subsystems affected by this patch series: mm (memcg and cleanups) and epoll" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: mm/Kconfig: fix spelling mistake "whats" -> "what's" selftests/filesystems: expand epoll with epoll_pwait2 epoll: wire up syscall epoll_pwait2 epoll: add syscall epoll_pwait2 epoll: convert internal api to timespec64 epoll: eliminate unnecessary lock for zero timeout epoll: replace gotos with a proper loop epoll: pull all code between fetch_events and send_event into the loop epoll: simplify and optimize busy loop logic epoll: move eavail next to the list_empty_careful check epoll: pull fatal signal checks into ep_send_events() epoll: simplify signal handling epoll: check for events when removing a timed out thread from the wait queue mm/memcontrol:rewrite mem_cgroup_page_lruvec() mm, kvm: account kvm_vcpu_mmap to kmemcg mm/memcg: remove unused definitions mm/memcg: warning on !memcg after readahead page charged mm/memcg: bail early from swap accounting if memcg disabled
Diffstat (limited to 'fs')
-rw-r--r--fs/eventpoll.c287
1 files changed, 181 insertions, 106 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 10b81e69db74..a829af074eb5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -389,19 +389,24 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
*
* we must do our busy polling with irqs enabled
*/
-static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
- if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+ if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
-{
- if (ep->napi_id)
+ if (ep_events_available(ep))
+ return true;
+ /*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
ep->napi_id = 0;
+ return false;
+ }
+ return false;
}
/*
@@ -441,12 +446,9 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
#else
-static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
-{
-}
-
-static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
+ return false;
}
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
@@ -1625,6 +1627,14 @@ static int ep_send_events(struct eventpoll *ep,
poll_table pt;
int res = 0;
+ /*
+ * Always short-circuit for fatal signals to allow threads to make a
+ * timely exit without the chance of finding more events available and
+ * fetching repeatedly.
+ */
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
init_poll_funcptr(&pt, NULL);
mutex_lock(&ep->mtx);
@@ -1702,15 +1712,25 @@ static int ep_send_events(struct eventpoll *ep,
return res;
}
-static inline struct timespec64 ep_set_mstimeout(long ms)
+static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
- struct timespec64 now, ts = {
- .tv_sec = ms / MSEC_PER_SEC,
- .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
- };
+ struct timespec64 now;
+
+ if (ms < 0)
+ return NULL;
+
+ if (!ms) {
+ to->tv_sec = 0;
+ to->tv_nsec = 0;
+ return to;
+ }
+
+ to->tv_sec = ms / MSEC_PER_SEC;
+ to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
ktime_get_ts64(&now);
- return timespec64_add_safe(now, ts);
+ *to = timespec64_add_safe(now, *to);
+ return to;
}
/**
@@ -1722,8 +1742,8 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
* stored.
* @maxevents: Size (in terms of number of events) of the caller event buffer.
* @timeout: Maximum timeout for the ready events fetch operation, in
- * milliseconds. If the @timeout is zero, the function will not block,
- * while if the @timeout is less than zero, the function will block
+ * timespec. If the timeout is zero, the function will not block,
+ * while if the @timeout ptr is NULL, the function will block
* until at least one event has been retrieved (or an error
* occurred).
*
@@ -1731,55 +1751,59 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
* error code, in case of error.
*/
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
- int maxevents, long timeout)
+ int maxevents, struct timespec64 *timeout)
{
- int res = 0, eavail, timed_out = 0;
+ int res, eavail, timed_out = 0;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
lockdep_assert_irqs_enabled();
- if (timeout > 0) {
- struct timespec64 end_time = ep_set_mstimeout(timeout);
-
- slack = select_estimate_accuracy(&end_time);
+ if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
+ slack = select_estimate_accuracy(timeout);
to = &expires;
- *to = timespec64_to_ktime(end_time);
- } else if (timeout == 0) {
+ *to = timespec64_to_ktime(*timeout);
+ } else if (timeout) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
- * caller specified a non blocking operation. We still need
- * lock because we could race and not see an epi being added
- * to the ready list while in irq callback. Thus incorrectly
- * returning 0 back to userspace.
+ * caller specified a non blocking operation.
*/
timed_out = 1;
-
- write_lock_irq(&ep->lock);
- eavail = ep_events_available(ep);
- write_unlock_irq(&ep->lock);
-
- goto send_events;
}
-fetch_events:
+ /*
+ * This call is racy: We may or may not see events that are being added
+ * to the ready list under the lock (e.g., in IRQ callbacks). For, cases
+ * with a non-zero timeout, this thread will check the ready list under
+ * lock and will added to the wait queue. For, cases with a zero
+ * timeout, the user by definition should not care and will have to
+ * recheck again.
+ */
+ eavail = ep_events_available(ep);
+
+ while (1) {
+ if (eavail) {
+ /*
+ * Try to transfer events to user space. In case we get
+ * 0 events and there's still timeout left over, we go
+ * trying again in search of more luck.
+ */
+ res = ep_send_events(ep, events, maxevents);
+ if (res)
+ return res;
+ }
- if (!ep_events_available(ep))
- ep_busy_loop(ep, timed_out);
+ if (timed_out)
+ return 0;
- eavail = ep_events_available(ep);
- if (eavail)
- goto send_events;
+ eavail = ep_busy_loop(ep, timed_out);
+ if (eavail)
+ continue;
- /*
- * Busy poll timed out. Drop NAPI ID for now, we can add
- * it back in when we have moved a socket with a valid NAPI
- * ID onto the ready list.
- */
- ep_reset_busy_poll_napi_id(ep);
+ if (signal_pending(current))
+ return -EINTR;
- do {
/*
* Internally init_wait() uses autoremove_wake_function(),
* thus wait entry is removed from the wait queue on each
@@ -1809,55 +1833,38 @@ fetch_events:
* important.
*/
eavail = ep_events_available(ep);
- if (!eavail) {
- if (signal_pending(current))
- res = -EINTR;
- else
- __add_wait_queue_exclusive(&ep->wq, &wait);
- }
- write_unlock_irq(&ep->lock);
-
- if (eavail || res)
- break;
-
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
- timed_out = 1;
- break;
- }
-
- /* We were woken up, thus go and try to harvest some events */
- eavail = 1;
-
- } while (0);
+ if (!eavail)
+ __add_wait_queue_exclusive(&ep->wq, &wait);
- __set_current_state(TASK_RUNNING);
-
- if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
- __remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
- }
-send_events:
- if (fatal_signal_pending(current)) {
+ if (!eavail)
+ timed_out = !schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS);
+ __set_current_state(TASK_RUNNING);
+
/*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
*/
- res = -EINTR;
- }
- /*
- * Try to transfer events to user space. In case we get 0 events and
- * there's still timeout left over, we go trying again in search of
- * more luck.
- */
- if (!res && eavail &&
- !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
- goto fetch_events;
+ eavail = 1;
- return res;
+ if (!list_empty_careful(&wait.entry)) {
+ write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue,
+ * it means that the thread was woken up after its
+ * timeout expired before it could reacquire the lock.
+ * Thus, when wait.entry is empty, it needs to harvest
+ * events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait.entry);
+ __remove_wait_queue(&ep->wq, &wait);
+ write_unlock_irq(&ep->lock);
+ }
+ }
}
/**
@@ -2176,7 +2183,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* part of the user space epoll_wait(2).
*/
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
- int maxevents, int timeout)
+ int maxevents, struct timespec64 *to)
{
int error;
struct fd f;
@@ -2210,7 +2217,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = f.file->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, timeout);
+ error = ep_poll(ep, events, maxevents, to);
error_fput:
fdput(f);
@@ -2220,16 +2227,19 @@ error_fput:
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
- return do_epoll_wait(epfd, events, maxevents, timeout);
+ struct timespec64 to;
+
+ return do_epoll_wait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout));
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
*/
-SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
- int, maxevents, int, timeout, const sigset_t __user *, sigmask,
- size_t, sigsetsize)
+static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *to,
+ const sigset_t __user *sigmask, size_t sigsetsize)
{
int error;
@@ -2241,18 +2251,47 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
if (error)
return error;
- error = do_epoll_wait(epfd, events, maxevents, timeout);
+ error = do_epoll_wait(epfd, events, maxevents, to);
+
restore_saved_sigmask_unless(error == -EINTR);
return error;
}
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+ size_t, sigsetsize)
+{
+ struct timespec64 to;
+
+ return do_epoll_pwait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout),
+ sigmask, sigsetsize);
+}
+
+SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
+ int, maxevents, const struct __kernel_timespec __user *, timeout,
+ const sigset_t __user *, sigmask, size_t, sigsetsize)
+{
+ struct timespec64 ts, *to = NULL;
+
+ if (timeout) {
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+ to = &ts;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ return do_epoll_pwait(epfd, events, maxevents, to,
+ sigmask, sigsetsize);
+}
+
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
- struct epoll_event __user *, events,
- int, maxevents, int, timeout,
- const compat_sigset_t __user *, sigmask,
- compat_size_t, sigsetsize)
+static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, struct timespec64 *timeout,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize)
{
long err;
@@ -2265,10 +2304,46 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
+
restore_saved_sigmask_unless(err == -EINTR);
return err;
}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents, int, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{
+ struct timespec64 to;
+
+ return do_compat_epoll_pwait(epfd, events, maxevents,
+ ep_timeout_to_timespec(&to, timeout),
+ sigmask, sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
+ struct epoll_event __user *, events,
+ int, maxevents,
+ const struct __kernel_timespec __user *, timeout,
+ const compat_sigset_t __user *, sigmask,
+ compat_size_t, sigsetsize)
+{
+ struct timespec64 ts, *to = NULL;
+
+ if (timeout) {
+ if (get_timespec64(&ts, timeout))
+ return -EFAULT;
+ to = &ts;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ return do_compat_epoll_pwait(epfd, events, maxevents, to,
+ sigmask, sigsetsize);
+}
+
#endif
static int __init eventpoll_init(void)