Now that we understand the basic concept of a process, let's look at how Linux actually implements one. According to standard operating-system theory, the process is the unit of resource allocation and the thread is the unit of execution; the kernel manages processes with a Process Control Block (PCB) and threads with a Thread Control Block (TCB). Does Linux really follow that model? Let's take a look.
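Before diving into the kernel source, it helps to see the question from user space. Linux does not keep separate PCB and TCB structures: fork() and pthread_create() both funnel into the clone() family of system calls, and the kernel builds a task_struct either way; the flags merely decide which resources the new task shares with its creator. Below is a minimal user-space sketch (not kernel code) using glibc's clone() wrapper; the 64 KiB stack and the CLONE_VM | CLONE_FS | CLONE_FILES flag set are illustrative choices, not anything the kernel mandates.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int counter = 0;

static int child_fn(void *arg)
{
    (void)arg;
    counter++;                      /* runs in the new task */
    return 0;
}

int main(void)
{
    char *stack = malloc(64 * 1024);
    if (!stack)
        return 1;

    /*
     * A "thread-like" child: CLONE_VM shares the address space, so the
     * increment is visible to the parent. Drop CLONE_VM and you get
     * fork()-like copy-on-write semantics, and the parent prints 0.
     */
    pid_t pid = clone(child_fn, stack + 64 * 1024,
                      CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, NULL);
    if (pid < 0) {
        perror("clone");
        return 1;
    }

    waitpid(pid, NULL, 0);
    printf("counter after CLONE_VM child: %d\n", counter);  /* 1 */

    free(stack);
    return 0;
}

With that picture in mind, the kernel-side entry point behind all of these calls is kernel_clone() in kernel/fork.c, which the next excerpt is taken from.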
    /*
     * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
     * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
     * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
     * field in struct clone_args and it still doesn't make sense to have
     * them both point at the same memory location. Performing this check
     * here has the advantage that we don't need to have a separate helper
     * to check for legacy clone().
     */
    if ((args->flags & CLONE_PIDFD) &&
        (args->flags & CLONE_PARENT_SETTID) &&
        (args->pidfd == args->parent_tid))
        return -EINVAL;

    /*
     * Determine whether and which event to report to ptracer. When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if (args->exit_signal != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

    p = copy_process(NULL, trace, NUMA_NO_NODE, args);
    add_latent_entropy();

    if (IS_ERR(p))
        return PTR_ERR(p);

    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    trace_sched_process_fork(current, p);

    pid = get_task_pid(p, PIDTYPE_PID);
    nr = pid_vnr(pid);

    if (clone_flags & CLONE_PARENT_SETTID)
        put_user(nr, args->parent_tid);
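The CLONE_PIDFD / CLONE_PARENT_SETTID check above only matters for the legacy clone() entry point; with clone3() the pidfd gets its own field in struct clone_args. Here is a hedged user-space sketch of that newer interface. It assumes a kernel of roughly 5.4 or later plus glibc/kernel headers that provide SYS_clone3, struct clone_args and P_PIDFD, so treat it as illustrative rather than portable.

#define _GNU_SOURCE
#include <linux/sched.h>        /* struct clone_args, CLONE_PIDFD */
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int pidfd = -1;
    struct clone_args args;
    memset(&args, 0, sizeof(args));
    args.flags       = CLONE_PIDFD;          /* pidfd has its own field now */
    args.pidfd       = (uintptr_t)&pidfd;
    args.exit_signal = SIGCHLD;

    /* With no stack supplied, clone3() behaves like fork():
     * the child resumes here with a return value of 0. */
    long pid = syscall(SYS_clone3, &args, sizeof(args));
    if (pid < 0) {
        perror("clone3");
        return 1;
    }
    if (pid == 0) {                          /* child */
        sleep(1);
        _exit(42);
    }

    /* Parent: the pidfd becomes readable once the child exits. */
    struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
    poll(&pfd, 1, -1);

    siginfo_t si;
    waitid(P_PIDFD, pidfd, &si, WEXITED);    /* reap via the pidfd */
    printf("child %ld exited with status %d\n", pid, si.si_status);
    close(pidfd);
    return 0;
}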
static int do_execveat_common(int fd, struct filename *filename,
                              struct user_arg_ptr argv,
                              struct user_arg_ptr envp,
                              int flags)
{
    struct linux_binprm *bprm;
    int retval;

    if (IS_ERR(filename))
        return PTR_ERR(filename);

    /*
     * We move the actual failure in case of RLIMIT_NPROC excess from
     * set*uid() to execve() because too many poorly written programs
     * don't check setuid() return code.  Here we additionally recheck
     * whether NPROC limit is still exceeded.
     */
    if ((current->flags & PF_NPROC_EXCEEDED) &&
        is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
        retval = -EAGAIN;
        goto out_ret;
    }

    /* We're below the limit (still or again), so we don't want to make
     * further execve() calls fail. */
    current->flags &= ~PF_NPROC_EXCEEDED;

    bprm->file = file;
    /*
     * Record that a name derived from an O_CLOEXEC fd will be
     * inaccessible after exec.  This allows the code in exec to
     * choose to fail when the executable is not mmaped into the
     * interpreter and an open file descriptor is not passed to
     * the interpreter.  This makes for a better user experience
     * than having the interpreter start and then immediately fail
     * when it finds the executable is inaccessible.
     */
    if (bprm->fdpath && get_close_on_exec(fd))
        bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;

    /* Set the unchanging part of bprm->cred */
    retval = security_bprm_creds_for_exec(bprm);
    if (retval)
        goto out;

    retval = exec_binprm(bprm);
    if (retval < 0)
        goto out;

out:
    /*
     * If past the point of no return ensure the code never
     * returns to the userspace process.  Use an existing fatal
     * signal if present otherwise terminate the process with
     * SIGSEGV.
     */
    if (bprm->point_of_no_return && !fatal_signal_pending(current))
        force_fatal_sig(SIGSEGV);
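User space reaches do_execveat_common() through execve(2), execveat(2) and glibc's fexecve(). Below is a small sketch of the fd-based variant, assuming glibc's fexecve(), which uses execveat(2) with AT_EMPTY_PATH when available. It also connects to the O_CLOEXEC comment above: for an ELF binary this works fine, but doing the same with a #! script through a close-on-exec descriptor can be rejected precisely because of BINPRM_FLAGS_PATH_INACCESSIBLE.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

extern char **environ;

int main(void)
{
    /* Open the program first, then exec the descriptor. */
    int fd = open("/bin/echo", O_RDONLY | O_CLOEXEC);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    char *argv[] = { "echo", "hello from fexecve", NULL };
    fexecve(fd, argv, environ);

    perror("fexecve");          /* only reached if the exec failed */
    return 1;
}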
static int exec_binprm(struct linux_binprm *bprm)
{
    pid_t old_pid, old_vpid;
    int ret, depth;

    /* Need to fetch pid before load_binary changes it */
    old_pid = current->pid;
    rcu_read_lock();
    old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
    rcu_read_unlock();

    /* This allows 4 levels of binfmt rewrites before failing hard. */
    for (depth = 0;; depth++) {
        struct file *exec;
        if (depth > 5)
            return -ELOOP;

        ret = search_binary_handler(bprm);
        if (ret < 0)
            return ret;
        if (!bprm->interpreter)
            break;

    if (need_retry) {
        if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
            printable(bprm->buf[2]) && printable(bprm->buf[3]))
            return retval;
        if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
            return retval;
        need_retry = false;
        goto retry;
    }

    return retval;
}
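The loop in exec_binprm() is what makes interpreted programs work: binfmt_script recognizes the #! line, rewrites bprm to point at the interpreter, and the loop goes around again (up to the -ELOOP limit) until binfmt_elf finally loads a real ELF image; if nothing matches, search_binary_handler() may fall back to request_module("binfmt-...") as shown above. Here is a rough sketch that exercises that path; the /tmp/hello.sh path is just a hypothetical scratch file.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/tmp/hello.sh";       /* hypothetical scratch file */
    int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0755);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    dprintf(fd, "#!/bin/sh\necho \"run via binfmt_script, then binfmt_elf\"\n");
    close(fd);

    pid_t pid = fork();
    if (pid == 0) {
        char *argv[] = { (char *)path, NULL };
        char *envp[] = { NULL };
        execve(path, argv, envp);             /* kernel rewrites this to /bin/sh */
        perror("execve");
        _exit(127);
    }
    waitpid(pid, NULL, 0);
    unlink(path);
    return 0;
}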
linux-src/fs/binfmt_elf.c
static int load_elf_binary(struct linux_binprm *bprm)
{
    struct file *interpreter = NULL; /* to shut gcc up */
    unsigned long load_addr = 0, load_bias = 0;
    int load_addr_set = 0;
    unsigned long error;
    struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
    struct elf_phdr *elf_property_phdata = NULL;
    unsigned long elf_bss, elf_brk;
    int bss_prot = 0;
    int retval, i;
    unsigned long elf_entry;
    unsigned long e_entry;
    unsigned long interp_load_addr = 0;
    unsigned long start_code, end_code, start_data, end_data;
    unsigned long reloc_func_desc __maybe_unused = 0;
    int executable_stack = EXSTACK_DEFAULT;
    struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
    struct elfhdr *interp_elf_ex = NULL;
    struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
    struct mm_struct *mm;
    struct pt_regs *regs;

    retval = -ENOEXEC;
    /* First of all, some simple consistency checks */
    if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
        goto out;

    if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
        goto out;
    if (!elf_check_arch(elf_ex))
        goto out;
    if (elf_check_fdpic(elf_ex))
        goto out;
    if (!bprm->file->f_op->mmap)
        goto out;

    elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
    if (!elf_phdata)
        goto out;

    elf_ppnt = elf_phdata;
    for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) {
        char *elf_interpreter;

        if (elf_ppnt->p_type == PT_GNU_PROPERTY) {
            elf_property_phdata = elf_ppnt;
            continue;
        }

        if (elf_ppnt->p_type != PT_INTERP)
            continue;

        /*
         * This is the program interpreter used for shared libraries -
         * for now assume that this is an a.out format binary.
         */
        retval = -ENOEXEC;
        if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
            goto out_free_ph;
    elf_ppnt = elf_phdata;
    for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
        switch (elf_ppnt->p_type) {
        case PT_GNU_STACK:
            if (elf_ppnt->p_flags & PF_X)
                executable_stack = EXSTACK_ENABLE_X;
            else
                executable_stack = EXSTACK_DISABLE_X;
            break;

        case PT_LOPROC ... PT_HIPROC:
            retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
                                      bprm->file, false,
                                      &arch_state);
            if (retval)
                goto out_free_dentry;
            break;
        }
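Everything load_elf_binary() reads here — the ELF header, the program header table, PT_INTERP, PT_GNU_STACK — is visible from user space too. The sketch below walks the same structures with <elf.h>; it assumes a 64-bit ELF file and is a simplified reader, not the kernel's algorithm.

#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <elf-file>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }

    Elf64_Ehdr eh;
    if (pread(fd, &eh, sizeof(eh), 0) != sizeof(eh) ||
        memcmp(eh.e_ident, ELFMAG, SELFMAG) != 0) {
        fprintf(stderr, "not an ELF file\n");
        return 1;
    }
    printf("e_type=%u (2=ET_EXEC, 3=ET_DYN), e_phnum=%u\n", eh.e_type, eh.e_phnum);

    for (int i = 0; i < eh.e_phnum; i++) {
        Elf64_Phdr ph;
        pread(fd, &ph, sizeof(ph), eh.e_phoff + i * sizeof(ph));
        printf("phdr %2d: p_type=0x%-8x p_vaddr=0x%-10lx p_memsz=0x%lx\n",
               i, ph.p_type, (unsigned long)ph.p_vaddr, (unsigned long)ph.p_memsz);

        if (ph.p_type == PT_INTERP) {          /* the header the kernel scans for */
            char interp[256] = "";
            pread(fd, interp, sizeof(interp) - 1, ph.p_offset);
            printf("         PT_INTERP = %s\n", interp);
        }
    }
    close(fd);
    return 0;
}

Running it on a typical dynamically linked program shows PT_INTERP pointing at the dynamic loader (for example /lib64/ld-linux-x86-64.so.2), which is exactly the interpreter load_elf_binary() goes on to validate and map below.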
    /* Some simple consistency checks for the interpreter */
    if (interpreter) {
        retval = -ELIBBAD;
        /* Not an ELF interpreter */
        if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
            goto out_free_dentry;
        /* Verify the interpreter has a valid arch */
        if (!elf_check_arch(interp_elf_ex) ||
            elf_check_fdpic(interp_elf_ex))
            goto out_free_dentry;

        /* Load the interpreter program headers */
        interp_elf_phdata = load_elf_phdrs(interp_elf_ex, interpreter);
        if (!interp_elf_phdata)
            goto out_free_dentry;

        /* Pass PT_LOPROC..PT_HIPROC headers to arch code */
        elf_property_phdata = NULL;
        elf_ppnt = interp_elf_phdata;
        for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
            switch (elf_ppnt->p_type) {
            case PT_GNU_PROPERTY:
                elf_property_phdata = elf_ppnt;
                break;

            case PT_LOPROC ... PT_HIPROC:
                retval = arch_elf_pt_proc(interp_elf_ex,
                                          elf_ppnt, interpreter,
                                          true, &arch_state);
                if (retval)
                    goto out_free_dentry;
                break;
            }
    }

    /*
     * Allow arch code to reject the ELF at this point, whilst it's
     * still possible to return an error to the code that invoked
     * the exec syscall.
     */
    retval = arch_check_elf(elf_ex,
                            !!interpreter, interp_elf_ex,
                            &arch_state);
    if (retval)
        goto out_free_dentry;
    /* Flush all traces of the currently running executable */
    retval = begin_new_exec(bprm);
    if (retval)
        goto out_free_dentry;

    /* Do this immediately, since STACK_TOP as used in setup_arg_pages
       may depend on the personality.  */
    SET_PERSONALITY2(*elf_ex, &arch_state);
    if (elf_read_implies_exec(*elf_ex, executable_stack))
        current->personality |= READ_IMPLIES_EXEC;

    if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
        current->flags |= PF_RANDOMIZE;

    setup_new_exec(bprm);

    /* Do this so that we can load the interpreter, if need be.  We will
       change some of these later */
    retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
                             executable_stack);
    if (retval < 0)
        goto out_free_dentry;
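PF_RANDOMIZE and randomize_stack_top() are where address-space layout randomization enters the picture. A quick way to see the effect (hedged: the exact behavior depends on the architecture and on /proc/sys/kernel/randomize_va_space) is to print a few addresses and run the program several times:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
    int on_stack;
    void *on_heap = malloc(16);
    void *mapped  = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    /* With randomization on, these change every run; with
     * randomize_va_space=0 (or `setarch -R`) they stay fixed. */
    printf("code  (main): %p\n", (void *)main);
    printf("stack       : %p\n", (void *)&on_stack);
    printf("heap        : %p\n", on_heap);
    printf("mmap        : %p\n", mapped);

    free(on_heap);
    munmap(mapped, 4096);
    return 0;
}

For a PIE executable (ET_DYN with an interpreter) even the code address moves between runs, which is exactly the load_bias logic in the next excerpt.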
    /* Now we do a little grungy work by mmapping the ELF image into
       the correct location in memory. */
    for (i = 0, elf_ppnt = elf_phdata;
         i < elf_ex->e_phnum; i++, elf_ppnt++) {
        int elf_prot, elf_flags;
        unsigned long k, vaddr;
        unsigned long total_size = 0;
        unsigned long alignment;

        if (elf_ppnt->p_type != PT_LOAD)
            continue;

        if (unlikely(elf_brk > elf_bss)) {
            unsigned long nbyte;

            /* There was a PT_LOAD segment with p_memsz > p_filesz
               before this one. Map anonymous pages, if needed,
               and clear the area.  */
            retval = set_brk(elf_bss + load_bias,
                             elf_brk + load_bias,
                             bss_prot);
            if (retval)
                goto out_free_dentry;
            nbyte = ELF_PAGEOFFSET(elf_bss);
            if (nbyte) {
                nbyte = ELF_MIN_ALIGN - nbyte;
                if (nbyte > elf_brk - elf_bss)
                    nbyte = elf_brk - elf_bss;
                if (clear_user((void __user *)elf_bss + load_bias, nbyte)) {
                    /*
                     * This bss-zeroing can fail if the ELF
                     * file specifies odd protections. So
                     * we don't check the return value
                     */
                }
            }
        }

        vaddr = elf_ppnt->p_vaddr;
        /*
         * If we are loading ET_EXEC or we have already performed
         * the ET_DYN load_addr calculations, proceed normally.
         */
        if (elf_ex->e_type == ET_EXEC || load_addr_set) {
            elf_flags |= MAP_FIXED;
        } else if (elf_ex->e_type == ET_DYN) {
            /*
             * This logic is run once for the first LOAD Program
             * Header for ET_DYN binaries to calculate the
             * randomization (load_bias) for all the LOAD
             * Program Headers, and to calculate the entire
             * size of the ELF mapping (total_size). (Note that
             * load_addr_set is set to true later once the
             * initial mapping is performed.)
             *
             * There are effectively two types of ET_DYN
             * binaries: programs (i.e. PIE: ET_DYN with INTERP)
             * and loaders (ET_DYN without INTERP, since they
             * _are_ the ELF interpreter). The loaders must
             * be loaded away from programs since the program
             * may otherwise collide with the loader (especially
             * for ET_EXEC which does not have a randomized
             * position). For example to handle invocations of
             * "./ld.so someprog" to test out a new version of
             * the loader, the subsequent program that the
             * loader loads must avoid the loader itself, so
             * they cannot share the same load range. Sufficient
             * room for the brk must be allocated with the
             * loader as well, since brk must be available with
             * the loader.
             *
             * Therefore, programs are loaded offset from
             * ELF_ET_DYN_BASE and loaders are loaded into the
             * independently randomized mmap region (0 load_bias
             * without MAP_FIXED).
             */
            if (interpreter) {
                load_bias = ELF_ET_DYN_BASE;
                if (current->flags & PF_RANDOMIZE)
                    load_bias += arch_mmap_rnd();
                alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
                if (alignment)
                    load_bias &= ~(alignment - 1);
                elf_flags |= MAP_FIXED;
            } else
                load_bias = 0;

            /*
             * Since load_bias is used for all subsequent loading
             * calculations, we must lower it by the first vaddr
             * so that the remaining calculations based on the
             * ELF vaddrs will be correctly offset. The result
             * is then page aligned.
             */
            load_bias = ELF_PAGESTART(load_bias - vaddr);
        if (!load_addr_set) {
            load_addr_set = 1;
            load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
            if (elf_ex->e_type == ET_DYN) {
                load_bias += error -
                             ELF_PAGESTART(load_bias + vaddr);
                load_addr += load_bias;
                reloc_func_desc = load_bias;
            }
        }
        k = elf_ppnt->p_vaddr;
        if ((elf_ppnt->p_flags & PF_X) && k < start_code)
            start_code = k;
        if (start_data < k)
            start_data = k;

        /*
         * Check to see if the section's size will overflow the
         * allowed task size. Note that p_filesz must always be
         * <= p_memsz so it is only necessary to check p_memsz.
         */
        if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
            elf_ppnt->p_memsz > TASK_SIZE ||
            TASK_SIZE - elf_ppnt->p_memsz < k) {
            /* set_brk can never work. Avoid overflows. */
            retval = -EINVAL;
            goto out_free_dentry;
        }

        k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;

        if (k > elf_bss)
            elf_bss = k;
        if ((elf_ppnt->p_flags & PF_X) && end_code < k)
            end_code = k;
        if (end_data < k)
            end_data = k;
        k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
        if (k > elf_brk) {
            bss_prot = elf_prot;
            elf_brk = k;
        }
    }
    /* Calling set_brk effectively mmaps the pages that we need
     * for the bss and break sections.  We must do this before
     * mapping in the interpreter, to make sure it doesn't wind
     * up getting placed where the bss needs to go.
     */
    retval = set_brk(elf_bss, elf_brk, bss_prot);
    if (retval)
        goto out_free_dentry;
    if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
        retval = -EFAULT; /* Nobody gets to see this, but.. */
        goto out_free_dentry;
    }

    if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
        /*
         * For architectures with ELF randomization, when executing
         * a loader directly (i.e. no interpreter listed in ELF
         * headers), move the brk area out of the mmap region
         * (since it grows up, and may collide early with the stack
         * growing down), and into the unused ELF_ET_DYN_BASE region.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
            elf_ex->e_type == ET_DYN && !interpreter) {
            mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
        }

    if (current->personality & MMAP_PAGE_ZERO) {
        /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
           and some applications "depend" upon this behavior.
           Since we do not have the power to recompile these, we
           emulate the SVr4 behavior. Sigh. */
        error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                        MAP_FIXED | MAP_PRIVATE, 0);
    }

    regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
    /*
     * The ABI may specify that certain registers be set up in special
     * ways (on i386 %edx is the address of a DT_FINI function, for
     * example.  In addition, it may also specify (eg, PowerPC64 ELF)
     * that the e_entry field is the address of the function descriptor
     * for the startup routine, rather than the address of the startup
     * routine itself.  This macro performs whatever initialization to
     * the regs structure is required as well as any relocations to the
     * function descriptor entries when executing dynamically links apps.
     */
    ELF_PLAT_INIT(regs, reloc_func_desc);
#endif
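After load_elf_binary() returns and the process starts running, the result of all this mapping work is directly observable in /proc/self/maps: the PT_LOAD segments of the executable, the heap placed by the brk setup above, the interpreter (ld.so) and the libraries it mapped, the stack, and the vdso. A trivial dumper:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/self/maps", "r");
    if (!f) {
        perror("fopen");
        return 1;
    }

    char line[512];
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);        /* executable, heap, libs, stack, vdso... */

    fclose(f);
    return 0;
}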
    spin_lock_irq(&sighand->siglock);
    if (signal_group_exit(sig))
        /* Another thread got here before we took the lock.  */
        exit_code = sig->group_exit_code;
    else {
        sig->group_exit_code = exit_code;
        sig->flags = SIGNAL_GROUP_EXIT;
        zap_other_threads(current);
    }
    spin_unlock_irq(&sighand->siglock);
}
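This SIGNAL_GROUP_EXIT / zap_other_threads() path is what distinguishes exit_group(2) — which glibc's exit() and returning from main() use — from the raw exit(2) syscall, which terminates only the calling thread. A small demonstration (build with gcc -pthread; the sleeps are only there to make the ordering visible):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *worker(void *arg)
{
    (void)arg;
    for (int i = 0; i < 3; i++) {
        printf("worker still alive (%d)\n", i);
        sleep(1);
    }
    return NULL;
}

int main(void)
{
    pthread_t tid;
    pthread_create(&tid, NULL, worker, NULL);
    sleep(1);

    /*
     * exit(0) here would call exit_group(2): the kernel would set
     * SIGNAL_GROUP_EXIT and zap_other_threads() would kill the worker
     * immediately. Raw exit(2) ends only this thread, so the worker
     * keeps printing until it finishes.
     */
    syscall(SYS_exit, 0);
    return 0;                       /* never reached */
}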
    /*
     * We can get here from a kernel oops, sometimes with preemption off.
     * Start by checking for critical errors.
     * Then fix up important state like USER_DS and preemption.
     * Then do everything else.
     */

    WARN_ON(blk_needs_flush_plug(tsk));

    if (unlikely(in_interrupt()))
        panic("Aiee, killing interrupt handler!");
    if (unlikely(!tsk->pid))
        panic("Attempted to kill the idle task!");

    /*
     * If do_exit is called because this processes oopsed, it's possible
     * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
     * continuing. Amongst other possible reasons, this is to prevent
     * mm_release()->clear_child_tid() from writing to a user-controlled
     * kernel address.
     */
    force_uaccess_begin();

    if (unlikely(in_atomic())) {
        pr_info("note: %s[%d] exited with preempt_count %d\n",
            current->comm, task_pid_nr(current),
            preempt_count());
        preempt_count_set(PREEMPT_ENABLED);
    }

    profile_task_exit(tsk);
    kcov_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
    /*
     * We're taking recursive faults here in do_exit. Safest is to just
     * leave this task alone and wait for reboot.
     */
    if (unlikely(tsk->flags & PF_EXITING)) {
        pr_alert("Fixing recursive fault but reboot is needed!\n");
        futex_exit_recursive(tsk);
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule();
    }

    /* sync mm's RSS info before statistics gathering */
    if (tsk->mm)
        sync_mm_rss(tsk->mm);
    acct_update_integrals(tsk);
    group_dead = atomic_dec_and_test(&tsk->signal->live);
    if (group_dead) {
        /*
         * If the last thread of global init has exited, panic
         * immediately to get a useable coredump.
         */
        if (unlikely(is_global_init(tsk)))
            panic("Attempted to kill init! exitcode=0x%08x\n",
                  tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
        hrtimer_cancel(&tsk->signal->real_timer);
        exit_itimers(tsk->signal);
#endif
        if (tsk->mm)
            setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
    }
    acct_collect(code, group_dead);
    if (group_dead)
        tty_audit_exit();
    audit_free(tsk);

    if (group_dead)
        acct_process();
    trace_sched_process_exit(tsk);

    exit_sem(tsk);
    exit_shm(tsk);
    exit_files(tsk);
    exit_fs(tsk);
    if (group_dead)
        disassociate_ctty(1);
    exit_task_namespaces(tsk);
    exit_task_work(tsk);
    exit_thread(tsk);
    /*
     * Flush inherited counters to the parent - before the parent
     * gets woken up by child-exit notifications.
     *
     * because of cgroup mode, must be called before cgroup_exit()
     */
    perf_event_exit_task(tsk);

    sched_autogroup_exit_task(tsk);
    cgroup_exit(tsk);

    /*
     * FIXME: do that only when needed, using sched_exit tracepoint
     */
    flush_ptrace_hw_breakpoint(tsk);

    exit_tasks_rcu_start();
    exit_notify(tsk, group_dead);
    proc_exit_connector(tsk);
    mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
    if (unlikely(current->pi_state_cache))
        kfree(current->pi_state_cache);
#endif
    /*
     * Make sure we are holding no locks:
     */
    debug_check_no_locks_held();
    if (tsk->io_context)
        exit_io_context(tsk);

    if (tsk->splice_pipe)
        free_pipe_info(tsk->splice_pipe);

    if (tsk->task_frag.page)
        put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
    check_stack_usage();
    preempt_disable();
    if (tsk->nr_dirtied)
        __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
    exit_rcu();
    exit_tasks_rcu_finish();
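Note that do_exit() does not free the task completely: exit_notify() above turns the task into a zombie and signals the parent, and the exit code survives until someone reaps it. The classic user-space counterpart looks like this:

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();
    if (pid == 0)
        _exit(7);       /* child goes straight into do_exit() */

    sleep(2);           /* child now shows up as "Z" (zombie) in ps
                         * until the parent collects its status */

    int status;
    waitpid(pid, &status, 0);
    if (WIFEXITED(status))
        printf("child %d exited with code %d\n", (int)pid, WEXITSTATUS(status));
    return 0;
}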