Implement edge-triggered epoll

#698
This commit is contained in:
Theodore Dubois
2020-05-22 19:04:50 -07:00
parent 52f2c2e9e7
commit f38d232756
7 changed files with 159 additions and 62 deletions
+1 -10
View File
@@ -33,16 +33,7 @@ struct fd *fd_retain(struct fd *fd) {
int fd_close(struct fd *fd) {
int err = 0;
if (--fd->refcount == 0) {
lock(&fd->poll_lock);
struct poll_fd *poll_fd, *tmp;
list_for_each_entry_safe(&fd->poll_fds, poll_fd, tmp, polls) {
lock(&poll_fd->poll->lock);
list_remove(&poll_fd->polls);
list_remove(&poll_fd->fds);
unlock(&poll_fd->poll->lock);
free(poll_fd);
}
unlock(&fd->poll_lock);
poll_cleanup_fd(fd);
if (fd->ops->close)
err = fd->ops->close(fd);
// see comment in close in kernel/fs.h
+118 -34
View File
@@ -21,8 +21,8 @@
#define HAVE_KQUEUE 1
#endif
int real_poll_init(struct real_poll *real);
void real_poll_close(struct real_poll *real);
static int real_poll_init(struct real_poll *real);
static void real_poll_close(struct real_poll *real);
struct real_poll_event {
#if HAVE_EPOLL
struct epoll_event real;
@@ -30,8 +30,10 @@ struct real_poll_event {
struct kevent real;
#endif
};
int real_poll_wait(struct real_poll *real, struct real_poll_event *events, int max, struct timespec *timeout);
int real_poll_update(struct real_poll *real, int fd, int types);
static void *rpe_data(struct real_poll_event *rpe);
static int rpe_events(struct real_poll_event *rpe);
static int real_poll_wait(struct real_poll *real, struct real_poll_event *events, int max, struct timespec *timeout);
static int real_poll_update(struct real_poll *real, int fd, int types, void *data);
// lock order: fd, then poll
@@ -46,6 +48,7 @@ struct poll *poll_create() {
poll->notify_pipe[0] = -1;
poll->notify_pipe[1] = -1;
list_init(&poll->poll_fds);
list_init(&poll->pollfd_freelist);
lock_init(&poll->lock);
return poll;
}
@@ -64,6 +67,14 @@ static struct poll_fd *poll_find_fd(struct poll *poll, struct fd *fd) {
return NULL;
}
// Retires a poll_fd onto its poll's freelist rather than releasing the memory
// right away. See the comment on pollfd_freelist for the reasoning.
static void poll_fd_free(struct poll_fd *poll_fd) {
    // Read the owner first: the poisoning below overwrites the field.
    struct poll *owner = poll_fd->poll;
    // Fill the entry with a recognizable byte pattern so stale reads stand out.
    memset(poll_fd, 0xba, sizeof(struct poll_fd));
    // A NULL poll pointer is the marker for "this entry is free".
    poll_fd->poll = NULL;
    // The fds link is reused as the freelist link.
    list_add(&owner->pollfd_freelist, &poll_fd->fds);
}
// Returns true when poll_find_fd locates an entry for fd in poll, false otherwise.
bool poll_has_fd(struct poll *poll, struct fd *fd) {
    struct poll_fd *entry = poll_find_fd(poll, fd);
    return entry != NULL;
}
@@ -73,18 +84,25 @@ int poll_add_fd(struct poll *poll, struct fd *fd, int types, union poll_fd_info
lock(&fd->poll_lock);
lock(&poll->lock);
struct poll_fd *poll_fd = malloc(sizeof(struct poll_fd));
if (poll_fd == NULL) {
err = _ENOMEM;
goto out;
struct poll_fd *poll_fd;
if (!list_empty(&poll->pollfd_freelist)) {
poll_fd = list_first_entry(&poll->pollfd_freelist, struct poll_fd, fds);
list_remove(&poll_fd->fds);
} else {
poll_fd = malloc(sizeof(struct poll_fd));
if (poll_fd == NULL) {
err = _ENOMEM;
goto out;
}
}
poll_fd->fd = fd;
poll_fd->poll = poll;
poll_fd->types = types;
poll_fd->info = info;
poll_fd->triggered_types = 0;
if (poll_fd_is_real(poll_fd)) {
err = real_poll_update(&poll->real, fd->real_fd, types);
err = real_poll_update(&poll->real, fd->real_fd, types, poll_fd);
if (err < 0) {
free(poll_fd);
err = errno_map();
@@ -113,7 +131,7 @@ int poll_del_fd(struct poll *poll, struct fd *fd) {
}
if (poll_fd_is_real(poll_fd)) {
err = real_poll_update(&poll->real, fd->real_fd, 0);
err = real_poll_update(&poll->real, fd->real_fd, 0, poll_fd);
if (err < 0) {
err = errno_map();
goto out;
@@ -122,7 +140,7 @@ int poll_del_fd(struct poll *poll, struct fd *fd) {
list_remove(&poll_fd->polls);
list_remove(&poll_fd->fds);
free(poll_fd);
poll_fd_free(poll_fd);
err = 0;
out:
@@ -142,7 +160,7 @@ int poll_mod_fd(struct poll *poll, struct fd *fd, int types, union poll_fd_info
}
if (poll_fd_is_real(poll_fd)) {
err = real_poll_update(&poll->real, fd->real_fd, types);
err = real_poll_update(&poll->real, fd->real_fd, types, poll_fd);
if (err < 0) {
err = errno_map();
goto out;
@@ -151,6 +169,7 @@ int poll_mod_fd(struct poll *poll, struct fd *fd, int types, union poll_fd_info
poll_fd->types = types;
poll_fd->info = info;
poll_fd->triggered_types &= types;
err = 0;
out:
@@ -159,12 +178,29 @@ out:
return err;
}
void poll_wakeup(struct fd *fd) {
// Detaches fd from every poll it is registered with. fd_close calls this when
// the last reference to the fd is dropped.
void poll_cleanup_fd(struct fd *fd) {
    lock(&fd->poll_lock);
    struct poll_fd *poll_fd, *tmp;
    list_for_each_entry_safe(&fd->poll_fds, poll_fd, tmp, polls) {
        // poll_fd_free() clears poll_fd->poll, so keep our own pointer to the
        // owning poll for the unlock below.
        struct poll *poll = poll_fd->poll;
        lock(&poll->lock);
        // Best effort: drop the host-side registration; a failure here is ignored.
        if (poll_fd_is_real(poll_fd))
            real_poll_update(&poll->real, fd->real_fd, 0, poll_fd);
        list_remove(&poll_fd->polls);
        list_remove(&poll_fd->fds);
        // Recycle the entry while poll->lock is still held: poll_fd_free()
        // pushes onto poll->pollfd_freelist, which poll_add_fd pops from under
        // the same lock. Doing this after the unlock would mutate the list
        // without synchronization.
        poll_fd_free(poll_fd);
        unlock(&poll->lock);
    }
    unlock(&fd->poll_lock);
}
void poll_wakeup(struct fd *fd, int events) {
struct poll_fd *poll_fd;
lock(&fd->poll_lock);
list_for_each_entry(&fd->poll_fds, poll_fd, polls) {
struct poll *poll = poll_fd->poll;
lock(&poll->lock);
if (poll_fd->types & POLL_EDGETRIGGERED)
poll_fd->triggered_types &= ~events;
if (poll->notify_pipe[1] != -1)
write(poll->notify_pipe[1], "", 1);
unlock(&poll->lock);
@@ -185,7 +221,7 @@ int poll_wait(struct poll *poll_, poll_callback_t callback, void *context, struc
}
fcntl(poll_->notify_pipe[0], F_SETFL, O_NONBLOCK);
fcntl(poll_->notify_pipe[1], F_SETFL, O_NONBLOCK);
real_poll_update(&poll_->real, poll_->notify_pipe[0], POLL_READ);
real_poll_update(&poll_->real, poll_->notify_pipe[0], POLL_READ, NULL);
}
// TODO this is pretty broken with regards to timeouts
@@ -199,6 +235,9 @@ int poll_wait(struct poll *poll_, poll_callback_t callback, void *context, struc
if (fd->ops->poll)
poll_types = fd->ops->poll(fd);
poll_types &= poll_fd->types | POLL_HUP | POLL_ERR;
if (poll_fd->types & POLL_EDGETRIGGERED) {
poll_types &= ~poll_fd->triggered_types;
}
if (poll_types) {
if (callback(context, poll_types, poll_fd->info) == 1)
res++;
@@ -212,10 +251,14 @@ int poll_wait(struct poll *poll_, poll_callback_t callback, void *context, struc
list_remove(&poll_fd->polls);
list_remove(&poll_fd->fds);
if (poll_fd_is_real(poll_fd)) {
real_poll_update(&poll_->real, fd->real_fd, 0);
real_poll_update(&poll_->real, fd->real_fd, 0, NULL);
}
free(poll_fd);
}
if (poll_fd->types & POLL_EDGETRIGGERED) {
poll_fd->triggered_types |= poll_types;
}
}
}
if (res > 0)
@@ -227,9 +270,9 @@ int poll_wait(struct poll *poll_, poll_callback_t callback, void *context, struc
}
unlock(&poll_->lock);
int err;
struct real_poll_event e[4];
do {
struct real_poll_event e;
err = real_poll_wait(&poll_->real, &e, 1, timeout);
err = real_poll_wait(&poll_->real, e, sizeof(e)/sizeof(e[0]), timeout);
} while (sockrestart_should_restart_listen_wait() && errno == EINTR);
lock(&poll_->lock);
list_for_each_entry(&poll_->poll_fds, poll_fd, fds) {
@@ -245,6 +288,15 @@ int poll_wait(struct poll *poll_, poll_callback_t callback, void *context, struc
break;
}
// deal with any edge-triggered notifications
for (int i = 0; i < err; i++) {
struct poll_fd *triggered_poll_fd = rpe_data(&e[i]);
if (triggered_poll_fd != NULL && triggered_poll_fd->poll != NULL &&
triggered_poll_fd->types & POLL_EDGETRIGGERED) {
triggered_poll_fd->triggered_types &= ~rpe_events(&e[i]);
}
}
char fuck;
if (read(poll_->notify_pipe[0], &fuck, 1) < 0 && errno != EAGAIN) {
res = errno_map();
@@ -275,6 +327,11 @@ void poll_destroy(struct poll *poll) {
free(poll_fd);
}
list_for_each_entry_safe(&poll->pollfd_freelist, poll_fd, tmp, fds) {
list_remove(&poll_fd->fds);
free(poll_fd);
}
real_poll_close(&poll->real);
free(poll);
}
@@ -283,66 +340,93 @@ void poll_destroy(struct poll *poll) {
#if HAVE_EPOLL
int real_poll_init(struct real_poll *real) {
// Creates the host epoll instance and stores its descriptor in real->fd.
// Returns 0 on success, or -1 if epoll_create1 fails.
static int real_poll_init(struct real_poll *real) {
    int epfd = epoll_create1(0);
    real->fd = epfd;
    return epfd < 0 ? -1 : 0;
}
int real_poll_wait(struct real_poll *real, struct real_poll_event *events, int max, struct timespec *timeout) {
// Waits on the host epoll instance for up to `max` events.
// A NULL timeout is passed to epoll_wait as -1; otherwise the timespec is
// converted to whole milliseconds (the sub-millisecond remainder is dropped).
// Returns whatever epoll_wait returns.
static int real_poll_wait(struct real_poll *real, struct real_poll_event *events, int max, struct timespec *timeout) {
    int timeout_millis;
    if (timeout == NULL) {
        timeout_millis = -1;
    } else {
        timeout_millis = timeout->tv_sec * 1000 + timeout->tv_nsec / 1000000;
    }
    struct epoll_event *out = (struct epoll_event *) events;
    return epoll_wait(real->fd, out, max, timeout_millis);
}
int real_poll_update(struct real_poll *real, int fd, int types) {
static int real_poll_update(struct real_poll *real, int fd, int types, void *data) {
types &= ~EPOLLONESHOT;
if (types == 0)
return epoll_ctl(real->fd, EPOLL_CTL_DEL, fd, NULL);
struct epoll_event epevent = {.events = types};
struct epoll_event epevent = {.events = types, .data.ptr = data};
int err = epoll_ctl(real->fd, EPOLL_CTL_MOD, fd, &epevent);
if (err < 0 && errno == ENOENT)
err = epoll_ctl(real->fd, EPOLL_CTL_ADD, fd, &epevent);
return err;
}
// Returns the user pointer carried by a host epoll event, i.e. the value
// real_poll_update stored in .data.ptr for that registration.
static void *rpe_data(struct real_poll_event *rpe) {
    struct epoll_event *host = &rpe->real;
    return host->data.ptr;
}
// Returns the event mask reported by a host epoll event, converted to int.
static int rpe_events(struct real_poll_event *rpe) {
    int mask = rpe->real.events;
    return mask;
}
#elif HAVE_KQUEUE
int real_poll_init(struct real_poll *real) {
// Creates the host kqueue and stores its descriptor in real->fd.
// Returns 0 on success, or -1 if kqueue() fails.
static int real_poll_init(struct real_poll *real) {
    int kq = kqueue();
    real->fd = kq;
    return kq < 0 ? -1 : 0;
}
int real_poll_update(struct real_poll *real, int fd, int types) {
struct kevent e[3]; // one for EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT
if (types & POLL_READ) {
EV_SET(&e[0], fd, EVFILT_READ, EV_ADD, 0, 0, 0);
} else if (types & POLL_HUP) {
static int real_poll_update(struct real_poll *real, int fd, int types, void *data) {
struct kevent e[3] = {
{.filter = EVFILT_READ, .flags = types & (POLL_READ | POLL_HUP) ? EV_ADD : EV_DELETE},
{.filter = EVFILT_WRITE, .flags = types & POLL_WRITE ? EV_ADD : EV_DELETE},
{.filter = EVFILT_EXCEPT, .flags = types & POLL_ERR ? EV_ADD : EV_DELETE},
};
if (!(types & POLL_READ) && types & POLL_HUP) {
// Set the low water mark really high so we'll only get woken up on a hangup
EV_SET(&e[0], fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, INT_MAX, 0);
} else {
EV_SET(&e[0], fd, EVFILT_READ, EV_DELETE, 0, 0, 0);
e[0].fflags = NOTE_LOWAT;
e[0].data = INT_MAX;
}
EV_SET(&e[1], fd, EVFILT_WRITE, types & POLL_WRITE ? EV_ADD : EV_DELETE, 0, 0, 0);
EV_SET(&e[2], fd, EVFILT_EXCEPT, types & POLL_ERR ? EV_ADD : EV_DELETE, 0, 0, 0);
for (int i = 0; i < 3; i++) {
e[i].ident = fd;
e[i].udata = data;
e[i].flags |= EV_RECEIPT;
if (types & POLL_EDGETRIGGERED)
e[i].flags |= EV_CLEAR;
}
return kevent(real->fd, e, 3, e, 3, NULL);
}
int real_poll_wait(struct real_poll *real, struct real_poll_event *events, int max, struct timespec *timeout) {
// Waits on the host kqueue for up to `max` events, submitting no changes.
// The timeout is handed to kevent unchanged; returns whatever kevent returns.
static int real_poll_wait(struct real_poll *real, struct real_poll_event *events, int max, struct timespec *timeout) {
    struct kevent *out = (struct kevent *) events;
    return kevent(real->fd, NULL, 0, out, max, timeout);
}
// Returns the user pointer carried by a host kevent, i.e. the value
// real_poll_update stored in udata for that registration.
static void *rpe_data(struct real_poll_event *rpe) {
    struct kevent *host = &rpe->real;
    return host->udata;
}
// Translates a host kevent into POLL_* bits based on its filter:
//   EVFILT_READ   -> POLL_READ if data > 0, plus POLL_HUP if EV_EOF is set
//   EVFILT_WRITE  -> POLL_WRITE
//   EVFILT_EXCEPT -> POLL_ERR
// Any other filter yields 0.
static int rpe_events(struct real_poll_event *rpe) {
    switch (rpe->real.filter) {
        case EVFILT_READ: {
            int mask = 0;
            if (rpe->real.data > 0) {
                mask |= POLL_READ;
            }
            if (rpe->real.flags & EV_EOF) {
                mask |= POLL_HUP;
            }
            return mask;
        }
        case EVFILT_WRITE:
            return POLL_WRITE;
        case EVFILT_EXCEPT:
            return POLL_ERR;
        default:
            return 0;
    }
}
#endif
void real_poll_close(struct real_poll *real) {
// Closes the host descriptor held in real->fd; the result of close() is ignored.
static void real_poll_close(struct real_poll *real) {
    int host_fd = real->fd;
    close(host_fd);
}
+27 -1
View File
@@ -11,6 +11,22 @@ struct poll {
struct real_poll real;
int notify_pipe[2];
int waiters; // if nonzero, notify_pipe exists
// This is used to solve the race/UaF described here: https://lwn.net/Articles/520012/
// thread 1: calls poll_wait, real_poll_wait returns an event with a pointer to a poll_fd
// thread 2: calls poll_del_fd which frees the same poll_fd
//
// This can't be solved by adding locks because thread 1 could get
// suspended after real_poll_wait returns but before it has a chance to
// lock anything.
//
// An attempt was made to solve this with a Linux kernel patch, which
// almost went in 3.7 but was backed out after discussion at
// https://lkml.org/lkml/2012/10/16/302, and anyway wouldn't have solved
// the problem on Darwin. My solution is to just not free poll_fds, and
// instead move them to a freelist where they can be reused.
struct list pollfd_freelist;
lock_t lock;
};
@@ -24,6 +40,10 @@ struct poll_fd {
int fd;
uint64_t num;
} info;
// Used to implement edge-triggered notifications. When an event is
// returned its bits are set here, and those bits are ignored on the next
// call to poll_wait. The bits are cleared by poll_wakeup.
int triggered_types;
// locked by containing struct fd
struct poll *poll;
@@ -45,6 +65,7 @@ struct poll_fd {
#define POLL_HUP 16
#define POLL_NVAL 32
#define POLL_ONESHOT (1 << 30)
// Requests edge-triggered delivery for a poll_fd. Bit 31 is built from an
// unsigned 1 because left-shifting a signed int 1 into the sign bit is
// undefined behavior in C.
#define POLL_EDGETRIGGERED (1u << 31)
struct poll_event {
struct fd *fd;
int types;
@@ -54,8 +75,10 @@ bool poll_has_fd(struct poll *poll, struct fd *fd);
int poll_add_fd(struct poll *poll, struct fd *fd, int types, union poll_fd_info info);
int poll_mod_fd(struct poll *poll, struct fd *fd, int types, union poll_fd_info info);
int poll_del_fd(struct poll *poll, struct fd *fd);
// Indicates that the specified events have been triggered. Each call will
// generate a new edge-triggered notification.
// please do not call this while holding any locks you would acquire in your poll operation
void poll_wakeup(struct fd *fd);
void poll_wakeup(struct fd *fd, int events);
// Waits for events on the fds in this poll, and calls the callback for each one found.
// Returns the number of times the callback returned 1, or negative for error.
typedef int (*poll_callback_t)(void *context, int types, union poll_fd_info info);
@@ -64,4 +87,7 @@ int poll_wait(struct poll *poll, poll_callback_t callback, void *context, struct
// thread will add or remove fds from this poll
void poll_destroy(struct poll *poll);
// for fd_close
void poll_cleanup_fd(struct fd *fd);
#endif
+5 -5
View File
@@ -80,12 +80,12 @@ struct tty *tty_get(struct tty_driver *driver, int type, int num) {
return tty;
}
static void tty_poll_wakeup(struct tty *tty) {
static void tty_poll_wakeup(struct tty *tty, int events) {
unlock(&tty->lock);
struct fd *fd;
lock(&tty->fds_lock);
list_for_each_entry(&tty->fds, fd, tty_other_fds) {
poll_wakeup(fd);
poll_wakeup(fd, events);
}
unlock(&tty->fds_lock);
lock(&tty->lock);
@@ -109,7 +109,7 @@ void tty_release(struct tty *tty) {
unlock(&tty->lock);
if (master != NULL) {
lock(&master->lock);
tty_poll_wakeup(master);
tty_poll_wakeup(master, POLL_READ | POLL_HUP);
unlock(&master->lock);
}
}
@@ -211,7 +211,7 @@ static int tty_close(struct fd *fd) {
static void tty_input_wakeup(struct tty *tty) {
notify(&tty->produced);
tty_poll_wakeup(tty);
tty_poll_wakeup(tty, POLL_READ);
}
static int tty_push_char(struct tty *tty, char ch, bool flag, int blocking) {
@@ -767,7 +767,7 @@ void tty_set_winsize(struct tty *tty, struct winsize_ winsize) {
void tty_hangup(struct tty *tty) {
tty->hung_up = true;
tty_poll_wakeup(tty);
tty_poll_wakeup(tty, POLL_READ | POLL_WRITE | POLL_ERR | POLL_HUP);
}
struct dev_ops tty_dev = {
+5 -9
View File
@@ -50,13 +50,6 @@ int_t sys_epoll_ctl(fd_t epoll_f, int_t op, fd_t f, addr_t event_addr) {
if (user_get(event_addr, event))
return _EFAULT;
STRACE(" {events: %#x, data: %#x}", event.events, event.data);
if (event.events & EPOLLET_) {
// The exact semantics of EPOLLET are hard to emulate on Darwin, so
// let's play it safe. Common patterns using EPOLLET will work fine
// without it, albeit inefficiently.
TRACE("ignoring EPOLLET\n");
event.events &= ~EPOLLET_;
}
if (op == EPOLL_CTL_ADD_) {
if (poll_has_fd(epoll->epollfd.poll, fd))
@@ -77,7 +70,6 @@ static int epoll_callback(void *context, int types, union poll_fd_info info) {
struct epoll_context *c = context;
if (c->n >= c->max_events)
return 0;
STRACE(" {events: %#x, data: %#x}", types, info.num);
c->events[c->n++] = (struct epoll_event_) {.events = types, .data = info.num};
return 1;
}
@@ -103,9 +95,13 @@ int_t sys_epoll_wait(fd_t epoll_f, addr_t events_addr, int_t max_events, int_t t
STRACE("...\n");
int res = poll_wait(epoll->epollfd.poll, epoll_callback, &context, timeout < 0 ? NULL : &timeout_ts);
STRACE("%d end epoll_wait", current->pid);
if (res >= 0)
if (res >= 0) {
for (int i = 0; i < res; i++) {
STRACE(" {events: %#x, data: %#x}", events[i].events, events[i].data);
}
if (user_write(events_addr, events, sizeof(struct epoll_event_) * res))
return _EFAULT;
}
return res;
}
+2 -2
View File
@@ -39,7 +39,7 @@ static ssize_t eventfd_read(struct fd *fd, void *buf, size_t bufsize) {
fd->eventfd.val = 0;
notify(&fd->cond);
unlock(&fd->lock);
poll_wakeup(fd);
poll_wakeup(fd, POLL_WRITE);
return sizeof(uint64_t);
}
@@ -65,7 +65,7 @@ static ssize_t eventfd_write(struct fd *fd, const void *buf, size_t bufsize) {
fd->eventfd.val += increment;
notify(&fd->cond);
unlock(&fd->lock);
poll_wakeup(fd);
poll_wakeup(fd, POLL_READ);
return sizeof(uint64_t);
}
+1 -1
View File
@@ -336,7 +336,7 @@ static void timerfd_callback(struct fd *fd) {
fd->timerfd.expirations++;
notify(&fd->cond);
unlock(&fd->lock);
poll_wakeup(fd);
poll_wakeup(fd, POLL_READ);
}
fd_t sys_timerfd_create(int_t clockid, int_t flags) {