/*
* This structure is stored inside the "private_data" member of the file
* structure and rapresent the main data sructure for the eventpoll
* interface.
*/
struct eventpoll {
/* Protect the this structure access */
rwlock_t lock;
/*
* This semaphore is used to ensure that files are not removed
* while epoll is using them. This is read-held during the event
* collection loop and it is write-held during the file cleanup
* path, the epoll file exit code and the ctl operations.
*/
struct rw_semaphore sem;
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;
/* List of ready file descriptors */
struct list_head rdllist;
/* RB-Tree root used to store monitored fd structs */
struct rb_root rbr;
};
/*
* It opens an eventpoll file descriptor by suggesting a storage of "size"
* file descriptors. The size parameter is just an hint about how to size
* data structures. It won't prevent the user to store more than "size"
* file descriptors inside the epoll interface. It is the kernel part of
* the userspace epoll_create(2).
*/
//注意這個size參數(shù)只是一個參考值,而沒有被使用到。
//這個函數(shù)就是用于建立一個eventpoll結(jié)構(gòu), 然后返回一個文件描述符,
//通過這個文件描述符可以獲取這個相應(yīng)的eventpoll結(jié)構(gòu).
asmlinkage long sys_epoll_create(int size)
{
int error, fd = -1;
struct eventpoll *ep;
struct inode *inode;
struct file *file;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
current, size));
/*
* Sanity check on the size parameter, and create the internal data
* structure ( "struct eventpoll" ).
*/
error = -EINVAL;
if (size <= 0 || (error = ep_alloc(&ep)) != 0) //分配并初始化一個eventpoll結(jié)構(gòu)
goto eexit_1;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure, and inode and a free file descriptor.
*/
//在這個eventpoll文件系統(tǒng)中分配inode節(jié)點(diǎn),并且分配相應(yīng)的fd, file結(jié)構(gòu),
//并有file->private_data = ep, 就是說將ep與這個file/fd聯(lián)系起來.
error = ep_getfd(&fd, &inode, &file, ep);
if (error)
goto eexit_2;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, fd));
return fd;
eexit_2:
ep_free(ep);
kfree(ep);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, error));
return error;
}
struct epoll_event {
__u32 events;
__u64 data;
} EPOLL_PACKED;
/*
* Each file descriptor added to the eventpoll interface will
* have an entry of this type linked to the hash.
*/
struct epitem {
/* RB-Tree node used to link this structure to the eventpoll rb-tree */
struct rb_node rbn;
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
/* Number of active wait queue attached to poll operations */
int nwait;
/* List containing poll wait queues */
struct list_head pwqlist;
/* The "container" of this item */
struct eventpoll *ep;
/* The structure that describe the interested events and the source fd */
struct epoll_event event;
/*
* Used to keep track of the usage count of the structure. This avoids
* that the structure will desappear from underneath our processing.
*/
atomic_t usecnt;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* List header used to link the item to the transfer list */
struct list_head txlink;
/*
* This is used during the collection/transfer of events to userspace
* to pin items empty events set.
*/
unsigned int revents;
};
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set. It represents
* the kernel part of the user space epoll_ctl(2).
*/
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
int error;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi; //每個fd對應(yīng)一個epitem結(jié)構(gòu),,它是epoll_ctl中insertion/removal/change操作的對象
struct epoll_event epds;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
current, epfd, op, fd, event));
error = -EFAULT;
if (ep_op_hash_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto eexit_1;
/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd); //eventpoll的file結(jié)構(gòu)
if (!file)
goto eexit_1;
/* Get the "struct file *" for the target file */
tfile = fget(fd); //待處理的file結(jié)構(gòu)
if (!tfile)
goto eexit_2;
/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll) //待處理的file必須支持poll
goto eexit_3;
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
//不能自己加自己, 但是可以將一個epoll file descriptor加入到另個個epoll/poll/select中
if (file == tfile || !is_file_epoll(file))
goto eexit_3;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data; //取出eventpoll結(jié)構(gòu)。
down_write(&ep->sem);
/* Try to lookup the file inside our hash table */
epi = ep_find(ep, tfile, fd); //從紅黑數(shù)中查找fd對應(yīng)的epi, epi->usecnt++
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD: //插入操作
if (!epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST; //如果已經(jīng)存在, 插入出錯
break;
case EPOLL_CTL_DEL: //刪除操作
if (epi)
error = ep_remove(ep, epi);
else
error = -ENOENT;
break;
case EPOLL_CTL_MOD: //修改操作
if (epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
break;
}
/*
* The function ep_find() increments the usage count of the structure
* so, if this is not NULL, we need to release it.
*/
if (epi)
ep_release_epitem(epi); // epi->usecnt++, if epi->usecnt==0, 釋放這個epi
up_write(&ep->sem);
eexit_3:
fput(tfile);
eexit_2:
fput(file);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
current, epfd, op, fd, event, error));
return error;
}
/* Wrapper struct used by poll queueing */
struct ep_pqueue {
poll_table pt;
struct epitem *epi;
};
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;
error = -ENOMEM; //插入操作,先分配一個epi
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
goto eexit_1;
//初始化epi
/* Item initialization follow here ... */
ep_rb_initnode(&epi->rbn);
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->txlink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep; //eventpoll
ep_set_ffd(&epi->ffd, tfile, fd); //待處理的文件描述符
epi->event = *event; //用戶關(guān)注的事件
atomic_set(&epi->usecnt, 1);
epi->nwait = 0;
//注意以下的初始化, 它用ep_ptable_queue_proc這個函數(shù)初始化poll_table,
//類似于在poll()實(shí)現(xiàn)中它是用__waitpoll()來初始化這個函數(shù). 這個函數(shù)會在
//poll_wait()中被調(diào)用,而poll_wait()則在驅(qū)動或文件系統(tǒng)的file_operations->poll
//中被調(diào)用.
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function.
*/
revents = tfile->f_op->poll(tfile, &epq.pt);
/*
* We have to check if something went wrong during the poll wait queue
* install process. Namely an allocation for a wait queue failed due
* high memory pressure.
*/
if (epi->nwait < 0)
goto eexit_2;
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irqsave(&ep->lock, flags);
/* Add the current item to the rb-tree */
ep_rbtree_insert(ep, epi); //插入到樹中
//如果可讀,直接加入到相應(yīng)的eventpoll的ep->rdllist中,
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq)) //如果這個睡眠隊(duì)列非空, 喚醒睡眠進(jìn)程
__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
write_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait); //???
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
current, ep, tfile, fd));
return 0;
eexit_2:
ep_unregister_pollwait(ep, epi);
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue.
*/
write_lock_irqsave(&ep->lock, flags);
if (ep_is_linked(&epi->rdllink))
ep_list_del(&epi->rdllink);
write_unlock_irqrestore(&ep->lock, flags);
kmem_cache_free(epi_cache, epi);
eexit_1:
return error;
}
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->qproc = qproc;
}
/*
* This is the callback that is used to add our wait queue to the
* target file wakeup lists.
*/
//這個函數(shù)用于將當(dāng)前進(jìn)程加入到設(shè)備的睡眠隊(duì)列中去,這樣,當(dāng)設(shè)備有數(shù)據(jù)可讀寫時(shí),
//設(shè)備的read/write函數(shù)會調(diào)用wake_up()喚醒睡眠在這個隊(duì)列上的進(jìn)程.
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
//將wait的函數(shù)初始化為ep_poll_callback,當(dāng)進(jìn)程被喚醒時(shí),他就會執(zhí)行這個函數(shù).
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
add_wait_queue(whead, &pwq->wait); //加入隊(duì)列
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
}
/*
* This is the callback that is passed to the wait queue wakeup
* machanism. It is called by the stored file descriptors when they
* have events to report.
*/
//這個函數(shù)體現(xiàn)了epoll于poll/select的本質(zhì)區(qū)別. 在poll/select中,它是通過遍歷所有的文件描述符
//來檢查每個文件描述符是否有數(shù)據(jù)可讀寫, 但是在epoll中,它是在一個文件可讀寫時(shí),通過wait_up()
//調(diào)用以下這個wait的callback函數(shù), 將這個有數(shù)據(jù)可讀寫的文件描述符加入到ready隊(duì)列中去.
//所以, 它就不用遍歷所有的文件描述符,而只是這個ready隊(duì)列而已了.
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
current, epi->ffd.file, epi, ep));
write_lock_irqsave(&ep->lock, flags);
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
* EPOLLONESHOT bit that disables the descriptor when an event is received,
* until the next EPOLL_CTL_MOD will be issued.
*/
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto is_disabled;
/* If this file is already in the ready list we exit soon */
if (ep_is_linked(&epi->rdllink))
goto is_linked;
list_add_tail(&epi->rdllink, &ep->rdllist); //加入到Ready隊(duì)列中去
is_linked:
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
if (waitqueue_active(&ep->wq)) //有數(shù)據(jù)可讀, 喚醒
__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
TASK_INTERRUPTIBLE);
if (waitqueue_active(&ep->poll_wait))
pwake++;
is_disabled:
write_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
return 1;
}
======================================================================
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
*/
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
{
int error;
struct file *file;
struct eventpoll *ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
current, epfd, events, maxevents, timeout));
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
error = -EFAULT;
goto eexit_1;
}
/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd);
if (!file)
goto eexit_1;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
if (!is_file_epoll(file)) //檢查是不是epoll文件描述符: (f->f_op == &eventpoll_fops;)
goto eexit_2;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = file->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);
eexit_2:
fput(file);
eexit_1:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
current, epfd, events, maxevents, timeout, error));
return error;
}
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res, eavail;
unsigned long flags;
long jtimeout;
wait_queue_t wait;
/*
* Calculate the timeout by checking for the "infinite" value ( -1 )
* and the overflow condition. The passed timeout is in milliseconds,
* that why (t * HZ) / 1000.
*/
jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
retry:
write_lock_irqsave(&ep->lock, flags);
res = 0;
//如果沒有ready的文件描述符,則睡眠等待被喚醒.
//我們在上面看到,它是通過poll_wait加入到設(shè)備的睡眠隊(duì)列中去的.
if (list_empty(&ep->rdllist)) {
/*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
__add_wait_queue(&ep->wq, &wait); //這個本身的睡眠隊(duì)列,不同于設(shè)備的睡眠隊(duì)列
for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
* to TASK_INTERRUPTIBLE before doing the checks.
*/
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&ep->rdllist) || !jtimeout)
break;
if (signal_pending(current)) {
res = -EINTR;
break;
}
write_unlock_irqrestore(&ep->lock, flags);
jtimeout = schedule_timeout(jtimeout);
write_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
/* Is it worth to try to dig for events ? */
eavail = !list_empty(&ep->rdllist);
write_unlock_irqrestore(&ep->lock, flags);
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
//有數(shù)據(jù)處理
if (!res && eavail &&
!(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
goto retry;
return res;
}
/*
* Perform the transfer of events to user space.
*/
static int ep_events_transfer(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
int eventcnt = 0;
struct list_head txlist;
INIT_LIST_HEAD(&txlist);
/*
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
*/
down_read(&ep->sem);
/* Collect/extract ready items */
if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
/* Build result set in userspace */
eventcnt = ep_send_events(ep, &txlist, events);
/* Reinject ready items into the ready list */
ep_reinject_items(ep, &txlist);
}
up_read(&ep->sem);
return eventcnt;
}
/*
* Since we have to release the lock during the __copy_to_user() operation and
* during the f_op->poll() call, we try to collect the maximum number of items
* by reducing the irqlock/irqunlock switching rate.
*/
static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
{
int nepi;
unsigned long flags;
struct list_head *lsthead = &ep->rdllist, *lnk;
struct epitem *epi;
write_lock_irqsave(&ep->lock, flags);
//將數(shù)據(jù)從ep->rdllist隊(duì)列移到txlist隊(duì)列
for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
epi = list_entry(lnk, struct epitem, rdllink);
lnk = lnk->next;
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->txlink)) {
/*
* This is initialized in this way so that the default
* behaviour of the reinjecting code will be to push back
* the item inside the ready list.
*/
//待處理事件
epi->revents = epi->event.events;
/* Link the ready item into the transfer list */
list_add(&epi->txlink, txlist);
nepi++;
/*
* Unlink the item from the ready list.
*/
ep_list_del(&epi->rdllink); //脫鏈
}
}
write_unlock_irqrestore(&ep->lock, flags);
return nepi;
}
/*
* This function is called without holding the "ep->lock" since the call to
* __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
* because of the way poll() is traditionally implemented in Linux.
*/
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
struct epoll_event __user *events)
{
int eventcnt = 0;
unsigned int revents;
struct list_head *lnk;
struct epitem *epi;
/*
* We can loop without lock because this is a task private list.
* The test done during the collection loop will guarantee us that
* another task will not try to collect this file. Also, items
* cannot vanish during the loop because we are holding "sem".
*/
list_for_each(lnk, txlist) {
epi = list_entry(lnk, struct epitem, txlink);
/*
* Get the ready file event set. We can safely use the file
* because we are holding the "sem" in read and this will
* guarantee that both the file and the item will not vanish.
*/
//返回事件
revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
/*
* Set the return event set for the current file descriptor.
* Note that only the task task was successfully able to link
* the item to its "txlist" will write this field.
*/
epi->revents = revents & epi->event.events; //得到所關(guān)心的事情
if (epi->revents) { //拷貝給用戶
if (__put_user(epi->revents,
&events[eventcnt].events) ||
__put_user(epi->event.data,
&events[eventcnt].data))
return -EFAULT;
if (epi->event.events & EPOLLONESHOT)
epi->event.events &= EP_PRIVATE_BITS;
eventcnt++;
}
}
return eventcnt;
}
//如下, 我們可以看到, ep_eventpoll_poll()是一個文件系統(tǒng)的poll具體實(shí)現(xiàn),
//它是設(shè)備或文件系統(tǒng)的一個實(shí)現(xiàn)樣本.
//epoll實(shí)現(xiàn)具體的poll()有兩個目的, 一是sys_epoll_wait()中檢查這個描述符是
//不是epoll描述符, (f->f_op == &eventpoll_fops;). 二是用來實(shí)現(xiàn)這個特殊的epoll
//文件描述符也可以加入其他的epoll/poll/select的描述符集中.
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_close,
.poll = ep_eventpoll_poll
};
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
unsigned int pollflags = 0;
unsigned long flags;
struct eventpoll *ep = file->private_data;
/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);
/* Check our condition */
read_lock_irqsave(&ep->lock, flags);
if (!list_empty(&ep->rdllist))
pollflags = POLLIN | POLLRDNORM;
read_unlock_irqrestore(&ep->lock, flags);
return pollflags;
}
|