Redian新闻
>
干货!Linux 内核学习笔记

干货!Linux 内核学习笔记

公众号新闻

本文由看雪论坛 e*16 a 原创,转载请注明来自看雪社区
以下是基于linux0.11的代码。





内核的五大结构






中断工作流程


1、ARM回忆


(1)做CPU工作模式的转化
(2)进行寄存器的拷贝与压栈
(3)设置中断向量表
(4)保存正常运行的函数返回值
(5)跳转到对应的中断服务函数上运行
(6)进行模式的复原及寄存器的复原
(7)跳转回正常工作的函数地址继续运行


2、linux中中断的工作流程


(1)将所有寄存器值入栈
(2)将异常码入栈(中断号)
(3)将当前函数的返回地址入栈
(4)调用中断函数
(5)返回地址出栈
(6)寄存器值出栈


3、中断源码


中断前后的处理 中断的执行
硬件中断的处理过程 asm.s trap.c
软件及系统调用的处理过程 system_call.s fork.c/signal.c/exit.c/sys.c

① asm.s代码及trap.c分析 (OPENING)

② system_call.s代码及fork.c/signal.c/exit.c/sys.c分析


(1) fork.c


system_call.s中有fork系统调用的入口_sys_fork:先call _find_empty_process,然后call _copy_process。
# _sys_fork: entry point of the fork system call.
# Calls _find_empty_process; if the value returned in %eax is negative
# (sign flag set by testl) it jumps straight to the ret at label 1.
# Otherwise it pushes %gs, %esi, %edi, %ebp and %eax, calls
# _copy_process, and then drops those five longwords (20 bytes).
.align 2
_sys_fork:
	call _find_empty_process
	testl %eax,%eax
	js 1f
	push %gs
	pushl %esi
	pushl %edi
	pushl %ebp
	pushl %eax
	call _copy_process
	addl $20,%esp
1:	ret

#include <errno.h>#include <linux/sched.h>#include <linux/kernel.h>#include <asm/segment.h>#include <asm/system.h>
extern void write_verify(unsigned long address);
long last_pid=0;
/*
 * verify_area: call write_verify() once for every 4096-byte page touched
 * by the range that starts at addr and is size bytes long.
 *
 * addr - start of the range; the base read from current->ldt[2] is added
 *        to it before the pages are walked
 * size - length of the range in bytes
 */
void verify_area(void * addr,int size)
{
	unsigned long start;

	start = (unsigned long) addr;
	size += start & 0xfff;		/* account for the offset inside the first page */
	start &= 0xfffff000;		/* round down to a 4096-byte boundary */
	start += get_base(current->ldt[2]);
	while (size>0) {
		size -= 4096;
		write_verify(start);
		start += 4096;
	}
}
/*
 * copy_mem: set the code/data base of the new task and copy the current
 * task's page tables to that base.
 *
 * nr - task slot number; the new base is nr * 0x4000000
 * p  - task structure of the new task
 *
 * Returns 0 on success, or -ENOMEM when copy_page_tables() reports failure.
 * Panics if the current task has different code and data bases, or if its
 * data limit is smaller than its code limit.
 */
int copy_mem(int nr,struct task_struct * p)
{
	unsigned long old_data_base,new_data_base,data_limit;
	unsigned long old_code_base,new_code_base,code_limit;

	code_limit=get_limit(0x0f);
	data_limit=get_limit(0x17);
	old_code_base = get_base(current->ldt[1]);
	old_data_base = get_base(current->ldt[2]);
	if (old_data_base != old_code_base)
		panic("We don't support separate I&D");
	if (data_limit < code_limit)
		panic("Bad data_limit");
	new_data_base = new_code_base = nr * 0x4000000;
	p->start_code = new_code_base;
	set_base(p->ldt[1],new_code_base);
	set_base(p->ldt[2],new_data_base);
	if (copy_page_tables(old_data_base,new_data_base,data_limit)) {
		/* release whatever was set up for the new range before failing */
		free_page_tables(new_data_base,data_limit);
		return -ENOMEM;
	}
	return 0;
}
/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in it's entirety.
 *
 * nr is the task slot to fill. The remaining parameters are register
 * values that are stored into the new task's tss (none is unused here).
 * Returns last_pid on success, or -EAGAIN when no free page is available
 * or copy_mem() fails.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
		long ebx,long ecx,long edx,
		long fs,long es,long ds,
		long eip,long cs,long eflags,long esp,long ss)
{
	struct task_struct *p;	/* task_struct of the child */
	int i;
	struct file *f;

	p = (struct task_struct *) get_free_page();
	if (!p)
		return -EAGAIN;
	task[nr] = p;	/* store the child in the task array */
	*p = *current;	/* NOTE! this doesn't copy the supervisor stack */
	/* from here on, overwrite the fields that must differ from the parent */
	p->state = TASK_UNINTERRUPTIBLE;
	p->pid = last_pid;
	p->father = current->pid;
	p->counter = p->priority;
	p->signal = 0;
	p->alarm = 0;
	p->leader = 0;		/* process leadership doesn't inherit */
	p->utime = p->stime = 0;
	p->cutime = p->cstime = 0;
	p->start_time = jiffies;
	p->tss.back_link = 0;
	p->tss.esp0 = PAGE_SIZE + (long) p;	/* top of the page that holds p */
	p->tss.ss0 = 0x10;
	p->tss.eip = eip;
	p->tss.eflags = eflags;
	p->tss.eax = 0;		/* the child sees 0 in eax */
	p->tss.ecx = ecx;
	p->tss.edx = edx;
	p->tss.ebx = ebx;
	p->tss.esp = esp;
	p->tss.ebp = ebp;
	p->tss.esi = esi;
	p->tss.edi = edi;
	p->tss.es = es & 0xffff;
	p->tss.cs = cs & 0xffff;
	p->tss.ss = ss & 0xffff;
	p->tss.ds = ds & 0xffff;
	p->tss.fs = fs & 0xffff;
	p->tss.gs = gs & 0xffff;
	p->tss.ldt = _LDT(nr);
	p->tss.trace_bitmap = 0x80000000;
	/* if current is the task that last used the math unit, save that
	 * state into the child's tss.i387 */
	if (last_task_used_math == current)
		__asm__("clts ; fnsave %0"::"m" (p->tss.i387));
	if (copy_mem(nr,p)) {	/* set up and copy the memory mapping */
		task[nr] = NULL;
		free_page((long) p);
		return -EAGAIN;
	}
	/* the child shares the parent's open files and inodes: bump counts */
	for (i=0; i<NR_OPEN;i++)
		if (f=p->filp[i])
			f->f_count++;
	if (current->pwd)
		current->pwd->i_count++;
	if (current->root)
		current->root->i_count++;
	if (current->executable)
		current->executable->i_count++;
	set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
	p->state = TASK_RUNNING;	/* do this last, just in case */
	return last_pid;
}
/*
 * find_empty_process: advance last_pid to a value that no existing task
 * uses, then return the index of the first free task slot.
 *
 * last_pid is incremented (restarting at 1 when it goes negative) until no
 * task in the array carries that pid. Slot 0 is never handed out.
 *
 * Returns the free slot index (>= 1), or -EAGAIN when every slot is taken.
 */
int find_empty_process(void)
{
	int slot;
	int pid_in_use;

	do {
		if ((++last_pid) < 0)
			last_pid = 1;
		pid_in_use = 0;
		for (slot = 0; slot < NR_TASKS; slot++) {
			if (task[slot] && task[slot]->pid == last_pid) {
				pid_in_use = 1;
				break;
			}
		}
	} while (pid_in_use);

	for (slot = 1; slot < NR_TASKS; slot++) {
		if (!task[slot])
			return slot;
	}
	return -EAGAIN;
}

① 在task链表中找一个进程空位存放
② 创建一个task_struct
③ 设置task_struct


(2)signal.c


这里只是进行一个简单的分析,详细分析请见第五章。
#include <linux/sched.h>#include <linux/kernel.h>#include <asm/segment.h>
#include <signal.h>
volatile void do_exit(int error_code);
/* sys_sgetmask: return the current task's blocked-signal bitmap. */
int sys_sgetmask()
{
	int mask = current->blocked;

	return mask;
}
/*
 * sys_ssetmask: install newmask as the current task's blocked-signal
 * bitmap, with the SIGKILL bit always cleared, and return the old bitmap.
 */
int sys_ssetmask(int newmask)
{
	int previous = current->blocked;
	int kill_bit = 1 << (SIGKILL - 1);

	current->blocked = newmask & ~kill_bit;
	return previous;
}
/*
 * save_old: copy one struct sigaction, byte by byte, from `from` to `to`
 * using put_fs_byte(), after running verify_area() on the destination.
 */
static inline void save_old(char * from,char * to)
{
	int copied = 0;

	verify_area(to, sizeof(struct sigaction));
	while (copied < sizeof(struct sigaction)) {
		put_fs_byte(*from,to);
		from++;
		to++;
		copied++;
	}
}
/*
 * get_new: fill one struct sigaction at `to`, byte by byte, with values
 * fetched from `from` through get_fs_byte().
 */
static inline void get_new(char * from,char * to)
{
	int copied = 0;

	while (copied < sizeof(struct sigaction)) {
		*to = get_fs_byte(from++);
		to++;
		copied++;
	}
}
int sys_signal(int signum, long handler, long restorer){struct sigaction tmp;
if (signum<1 || signum>32 || signum==SIGKILL) //判断信号值是否合法return -1; tmp.sa_handler = (void (*)(int)) handler; tmp.sa_mask = 0; tmp.sa_flags = SA_ONESHOT | SA_NOMASK; tmp.sa_restorer = (void (*)(void)) restorer; //设置sigaction结构体 handler = (long) current->sigaction[signum-1].sa_handler; current->sigaction[signum-1] = tmp; //将当前进程对应的信号结构体改为新分配的结构体return handler; //返回处理函数}
/*
 * sys_sigaction: replace the current task's action for signal `signum`
 * with the one read from `action`, optionally storing the old action.
 *
 * signum    - signal number; must be 1..32 and not SIGKILL
 * action    - new action, read through get_new()
 * oldaction - if non-NULL, receives the previous action through save_old()
 *
 * Returns 0 on success, -1 for an invalid signum.
 */
int sys_sigaction(int signum, const struct sigaction * action,
	struct sigaction * oldaction)
{
	struct sigaction tmp;

	if (signum<1 || signum>32 || signum==SIGKILL)
		return -1;
	tmp = current->sigaction[signum-1];	/* keep the old action */
	get_new((char *) action,
		(char *) (signum-1+current->sigaction));
	if (oldaction)
		save_old((char *) &tmp,(char *) oldaction);
	/* SA_NOMASK: empty mask; otherwise the signal is added to its own mask */
	if (current->sigaction[signum-1].sa_flags & SA_NOMASK)
		current->sigaction[signum-1].sa_mask = 0;
	else
		current->sigaction[signum-1].sa_mask |= (1<<(signum-1));
	return 0;
}
/*
 * do_signal: arrange for the current task to run its handler for signal
 * `signr`.
 *
 * A handler value of 1 returns immediately; a handler value of 0 returns
 * for SIGCHLD and calls do_exit(1<<(signr-1)) for every other signal.
 * Otherwise the eip parameter is overwritten with the handler address and
 * 7 or 8 longwords are written below esp with put_fs_long(): the restorer,
 * the signal number, the blocked mask (only without SA_NOMASK), eax, ecx,
 * edx, eflags and the original eip.
 *
 * NOTE(review): the stores go through *(&eip) and *(&esp), presumably so
 * that they modify the caller's saved register frame - confirm against
 * the assembly caller.
 */
void do_signal(long signr,long eax, long ebx, long ecx, long edx,
	long fs, long es, long ds,
	long eip, long cs, long eflags,
	unsigned long * esp, long ss)
{
	unsigned long sa_handler;
	long old_eip=eip;
	struct sigaction * sa = current->sigaction + signr - 1;
	int longs;
	unsigned long * tmp_esp;

	sa_handler = (unsigned long) sa->sa_handler;
	if (sa_handler==1)
		return;
	if (!sa_handler) {
		if (signr==SIGCHLD)
			return;
		else
			do_exit(1<<(signr-1));
	}
	if (sa->sa_flags & SA_ONESHOT)
		sa->sa_handler = NULL;	/* one-shot: handler is cleared once used */
	*(&eip) = sa_handler;
	longs = (sa->sa_flags & SA_NOMASK)?7:8;	/* one extra slot for the mask */
	*(&esp) -= longs;
	verify_area(esp,longs*4);
	tmp_esp=esp;
	put_fs_long((long) sa->sa_restorer,tmp_esp++);
	put_fs_long(signr,tmp_esp++);
	if (!(sa->sa_flags & SA_NOMASK))
		put_fs_long(current->blocked,tmp_esp++);
	put_fs_long(eax,tmp_esp++);
	put_fs_long(ecx,tmp_esp++);
	put_fs_long(edx,tmp_esp++);
	put_fs_long(eflags,tmp_esp++);
	put_fs_long(old_eip,tmp_esp++);
	current->blocked |= sa->sa_mask;
}


// Line 12#define SIGHUP 1 // 挂断控制终端或进程#define SIGINT 2 // 键盘中断#define SIGQUIT 3 // 键盘退出#define SIGILL 4 // 非法指令#define SIGTRAP 5 // 跟踪断点#define SIGABRT 6 // 异常结束#define SIGIOT 6 // 异常结束#define SIGUNUSED 7 // 未使用#define SIGFPE 8 // 协处理器错误#define SIGKILL 9 // 终止进程#define SIGUSR1 10 // 用户信号 1#define SIGSEGV 11 // 无效的内存引用#define SIGUSR2 12 // 用户信号 2#define SIGPIPE 13 // 管道写出错,读端全关闭#define SIGALRM 14 // 定时器警报#define SIGTERM 15 // 进程终止#define SIGSTKFLT 16 // 栈出错#define SIGCHLD 17 // 子进程状态改变#define SIGCONT 18 // 恢复进程继续执行#define SIGSTOP 19 // 暂停进程执行#define SIGTSTP 20 // tty 发出的停止进程信号#define SIGTTIN 21 // 后台进程请求输入#define SIGTTOU 22 // 后台进程请求输出
// Line 37#define SA_NOCLDSTOP 1 // 当子进程处于停止状态,就不对 SIGCHLD 信号做处理#define SA_NOMASK 0x40000000 // 允许在指定信号处理程序中再次收到该信号#define SA_ONESHOT 0x80000000 // 信号句柄一旦被调用过就恢复默认处理函数
// Line 45#define SIG_DFL ((void (*)(int))0) // 默认处理程序#define SIG_IGN ((void (*)(int))1) // 忽略信号对应的处理程序typedef unsigned int sigset_t;
/* Per-signal action record; one is kept for each signal of a task. */
struct sigaction {
	void (*sa_handler)(int);	// pointer to the signal handler
	sigset_t sa_mask;		// signals to block while this handler runs
	int sa_flags;			// taken from the SA_* flags defined under "Line 37"
	void (*sa_restorer)(void);	// restorer function pointer, supplied by libc
};


(3)exit.c

#include <errno.h>#include <signal.h>#include <sys/wait.h>
#include <linux/sched.h>#include <linux/kernel.h>#include <linux/tty.h>#include <asm/segment.h>
int sys_pause(void);int sys_close(int fd);
void release(struct task_struct * p) //释放进程p{int i;
if (!p)return;for (i=1 ; i<NR_TASKS ; i++)if (task[i]==p) { task[i]=NULL; free_page((long)p); //释放内存页 schedule(); //之后重新进行进程调度return; } panic("trying to release non-existent task");}
/*
 * send_sig: set the pending bit for signal `sig` in task p.
 *
 * The signal is delivered when priv is nonzero, when the current task's
 * euid equals p's euid, or when suser() is true.
 *
 * Returns 0 on success, -EINVAL for a NULL p or a sig outside 1..32,
 * -EPERM when none of the permission conditions holds.
 */
static inline int send_sig(long sig,struct task_struct * p,int priv)
{
	if (!p || sig < 1 || sig > 32)
		return -EINVAL;
	if (!(priv || (current->euid==p->euid) || suser()))
		return -EPERM;
	p->signal |= (1<<(sig-1));
	return 0;
}
static void kill_session(void) //关闭对话函数{struct task_struct **p = NR_TASKS + task; //获得task数组最后一个任务
while (--p > &FIRST_TASK) { //从最后一个向前遍历if (*p && (*p)->session == current->session) //如果遍历到当前的任务 (*p)->signal |= 1<<(SIGHUP-1); //则将SIGHUP挂断信号发送给当前任务 }}
/* * XXX need to check permissions needed to send signals to process * groups, etc. etc. kill() permissions semantics are tricky! */int sys_kill(int pid,int sig) //linux命令kill不是杀死的意思,是向某进程发送任何信号{struct task_struct **p = NR_TASKS + task; //指向最后int err, retval = 0;
// 注:每个进程组都有一个组长进程,组长进程的进程号等于进程组ID
if (!pid) while (--p > &FIRST_TASK) { //如果pid为0,进入循环if (*p && (*p)->pgrp == current->pid) //向进程组的所有成员发送信号if (err=send_sig(sig,*p,1)) retval = err; }else if (pid>0) while (--p > &FIRST_TASK) { //如果pid大于0if (*p && (*p)->pid == pid) //仅向pid进程发送信号if (err=send_sig(sig,*p,0)) retval = err; }else if (pid == -1) while (--p > &FIRST_TASK) //如果pid=-1if (err = send_sig(sig,*p,0)) //向除0号进程外的进程发送信号 retval = err;else while (--p > &FIRST_TASK) //如果pid<-1if (*p && (*p)->pgrp == -pid) //向进程组号为-pid的进程组发送信号if (err = send_sig(sig,*p,0)) retval = err;return retval;}
static void tell_father(int pid) //传入参数为父进程的pid{int i;
if (pid)for (i=0;i<NR_TASKS;i++) {if (!task[i])continue;if (task[i]->pid != pid)continue; task[i]->signal |= (1<<(SIGCHLD-1)); //SIGCHLD=17return; }/* if we don't find any fathers, we just release ourselves *//* This is not really OK. Must change it to make father 1 */ printk("BAD BAD - no father found\n\r"); release(current); //释放子进程}
/*
 * do_exit: tear down the current task and leave it as a zombie.
 *
 * code - value stored in current->exit_code
 *
 * Steps, in order: free the page tables of the code and data ranges,
 * re-parent the children to pid 1, close all open files, drop the pwd /
 * root / executable inodes, detach the tty and hang up the session if the
 * task is a leader, mark the task TASK_ZOMBIE, signal the parent and call
 * schedule(). The return statement only exists to suppress warnings.
 */
int do_exit(long code)
{
	int i;

	/* ldt[1] is the code descriptor, ldt[2] the data descriptor */
	free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));
	free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
	for (i=0 ; i<NR_TASKS ; i++)
		if (task[i] && task[i]->father == current->pid) {
			/* current is exiting, so its children get father 1 */
			task[i]->father = 1;
			if (task[i]->state == TASK_ZOMBIE)
				/* assumption task[1] is always init */
				(void) send_sig(SIGCHLD, task[1], 1);
		}
	for (i=0 ; i<NR_OPEN ; i++)
		if (current->filp[i])
			sys_close(i);	/* close every file the task still has open */
	iput(current->pwd);
	current->pwd=NULL;
	iput(current->root);
	current->root=NULL;
	iput(current->executable);
	current->executable=NULL;
	/* a leader that owns a tty clears that tty's process group */
	if (current->leader && current->tty >= 0)
		tty_table[current->tty].pgrp = 0;
	if (last_task_used_math == current)
		last_task_used_math = NULL;
	if (current->leader)
		kill_session();
	current->state = TASK_ZOMBIE;
	current->exit_code = code;
	tell_father(current->father);	/* raise SIGCHLD in the parent */
	schedule();
	return (-1);	/* just to suppress warnings */
}
/*
 * sys_exit: exit system call. The low 8 bits of error_code are shifted
 * into bits 8..15 and handed to do_exit().
 */
int sys_exit(int error_code)
{
	int status = (error_code & 0xff) << 8;

	return do_exit(status);
}
/*
 * sys_waitpid: wait for a child of the current task to stop or exit.
 *
 * pid       - > 0: only the child with that pid; 0: children in the
 *             caller's process group; -1: any child; < -1: children in
 *             process group -pid
 * stat_addr - receives the status word through put_fs_long()
 * options   - WUNTRACED also reports stopped children; WNOHANG returns 0
 *             instead of sleeping
 *
 * Returns the child's pid, 0 (WNOHANG and nothing to report), -EINTR when
 * a signal other than SIGCHLD is pending after waking, or -ECHILD when no
 * matching child exists.
 */
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
	int flag, code;
	struct task_struct ** p;

	verify_area(stat_addr,4);
repeat:
	flag=0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
		if (!*p || *p == current)	/* empty slot or ourselves */
			continue;
		if ((*p)->father != current->pid)	/* not our child */
			continue;
		if (pid>0) {
			if ((*p)->pid != pid)
				continue;
		} else if (!pid) {
			if ((*p)->pgrp != current->pgrp)
				continue;
		} else if (pid != -1) {
			if ((*p)->pgrp != -pid)
				continue;
		}
		/* pid == -1 accepts every child and falls straight through */
		switch ((*p)->state) {
			case TASK_STOPPED:
				if (!(options & WUNTRACED))
					continue;
				put_fs_long(0x7f,stat_addr);
				return (*p)->pid;
			case TASK_ZOMBIE:
				/* fold the child's times into the parent, then reap it */
				current->cutime += (*p)->utime;
				current->cstime += (*p)->stime;
				flag = (*p)->pid;
				code = (*p)->exit_code;
				release(*p);
				put_fs_long(code,stat_addr);
				return flag;
			default:
				/* a matching child exists but has nothing to report yet */
				flag=1;
				continue;
		}
	}
	if (flag) {
		if (options & WNOHANG)
			return 0;
		current->state=TASK_INTERRUPTIBLE;
		schedule();
		/* clear the SIGCHLD bit; rescan only if nothing else is pending */
		if (!(current->signal &= ~(1<<(SIGCHLD-1))))
			goto repeat;
		else
			return -EINTR;
	}
	return -ECHILD;
}

do_exit()

① 释放进程的代码段和数据段占用的内存。

② 关闭进程打开的所有文件,对当前目录和i节点进行同步(文件操作)。

③ 如果当前要销毁的进程有子进程,就让1号进程作为新的父进程。

④ 如果当前进程是一个会话头进程,则会终止会话中的所有进程。

⑤ 改变当前进程的运行状态,变成TASK_ZOMBIE(僵死)状态,并且向其父进程发送SIGCHLD信号,说明自己要死了。

sys_waitpid()

① 父进程创建子进程后,一般会调用wait或waitpid这两个函数,用来等待子进程终止。

② 当父进程收到SIGCHLD信号时,父进程会终止僵死状态的子进程。

③ 父进程会把子进程的运行时间累加到自己的运行时间上。

④ 把对应子进程的进程描述结构体进行释放,置空数组空槽。





进程


1.内核进程初始化与创建


每创建一个进程就对应着一个task_struct结构体。
/* Per-task descriptor; one exists for every task. */
struct task_struct {
/* these are hardcoded - don't touch */
	long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	long counter;
	long priority;
	long signal;
	struct sigaction sigaction[32];
	long blocked;	/* bitmap of masked signals */
/* various fields */
	int exit_code;
	unsigned long start_code,end_code,end_data,brk,start_stack;
	long pid,father,pgrp,session,leader;
	unsigned short uid,euid,suid;
	unsigned short gid,egid,sgid;
	long alarm;
	long utime,stime,cutime,cstime,start_time;
	unsigned short used_math;
/* file system info */
	int tty;		/* -1 if no tty, so it must be signed */
	unsigned short umask;
	struct m_inode * pwd;
	struct m_inode * root;
	struct m_inode * executable;
	unsigned long close_on_exec;
	struct file * filp[NR_OPEN];
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
	struct desc_struct ldt[3];
/* tss for this task */
	struct tss_struct tss;	/* holds the task's saved register values */
};

(1)0号和1号进程的创建

Linux在初始化的过程中会进行0号进程的创建。

注:分析0.11的main函数
/*
 * main: kernel start-up routine.
 *
 * Works out the memory layout (memory_end, buffer_memory_end,
 * main_memory_start), runs the subsystem init functions, enables
 * interrupts, switches to user mode and forks. The new task runs init();
 * the original task stays in the pause() loop forever.
 */
void main(void)		/* This really IS void, no error here. */
{			/* The startup routine assumes (well, ...) this */
/*
 * Interrupts are still disabled. Do necessary setups, then
 * enable them
 */
	ROOT_DEV = ORIG_ROOT_DEV;
	drive_info = DRIVE_INFO;
	memory_end = (1<<20) + (EXT_MEM_K<<10);	/* 1 MB plus extended memory in KB */
	memory_end &= 0xfffff000;		/* round down to a page boundary */
	if (memory_end > 16*1024*1024)
		memory_end = 16*1024*1024;
	if (memory_end > 12*1024*1024)
		buffer_memory_end = 4*1024*1024;
	else if (memory_end > 6*1024*1024)
		buffer_memory_end = 2*1024*1024;
	else
		buffer_memory_end = 1*1024*1024;
	main_memory_start = buffer_memory_end;
#ifdef RAMDISK
	main_memory_start += rd_init(main_memory_start, RAMDISK*1024);
#endif
	mem_init(main_memory_start,memory_end);
	trap_init();
	blk_dev_init();
	chr_dev_init();
	tty_init();
	time_init();
	sched_init();
	buffer_init(buffer_memory_end);
	hd_init();
	floppy_init();
	sti();
	move_to_user_mode();	/* switch to user mode */
	if (!fork()) {		/* fork() returns 0 only in the new task, which runs init() */
		init();
	}
	for(;;) pause();
}

内核(此时即0号进程)要先切换到用户态,之后再fork生成1号进程。
#define move_to_user_mode() \__asm__ ("movl %%esp,%%eax\n\t" \"pushl $0x17\n\t" \"pushl %%eax\n\t" \"pushfl\n\t" \"pushl $0x0f\n\t" \"pushl $1f\n\t" \"iret\n" \"1:\tmovl $0x17,%%eax\n\t" \"movw %%ax,%%ds\n\t" \"movw %%ax,%%es\n\t" \"movw %%ax,%%fs\n\t" \"movw %%ax,%%gs" \ :::"ax")

iret是从中断返回的指令,执行iret时,之前5个push压入的数据会出栈,按压栈顺序分别对应ss,esp,eflags,cs,eip。

fork生成1号进程之后,会在该进程中调用init()进行初始化,进一步分析如下:
static char * argv_rc[] = { "/bin/sh", NULL };static char * envp_rc[] = { "HOME=/", NULL };
static char * argv[] = { "-/bin/sh",NULL };static char * envp[] = { "HOME=/usr/root", NULL };
/*
 * init: body of the task created by the fork() in main().
 *
 * Calls setup(), opens /dev/tty0 and duplicates the descriptor twice,
 * prints the buffer and free-memory figures, then forks a child that runs
 * /bin/sh with /etc/rc as its input. Once that child has exited, it loops
 * forever: fork a shell on /dev/tty0, wait for it to die, report the exit
 * code, sync(), and start again.
 */
void init(void)
{
	int pid,i;

	setup((void *) &drive_info);
	/* presumably this first open yields descriptor 0 - the dup(0) calls rely on it */
	(void) open("/dev/tty0",O_RDWR,0);
	(void) dup(0);
	(void) dup(0);
	printf("%d buffers = %d bytes buffer space\n\r",NR_BUFFERS,
		NR_BUFFERS*BLOCK_SIZE);
	printf("Free mem: %d bytes\n\r",memory_end-main_memory_start);
	if (!(pid=fork())) {
		/* child (fork returned 0): reopen descriptor 0 on /etc/rc and
		 * run the shell on it */
		close(0);
		if (open("/etc/rc",O_RDONLY,0))
			_exit(1);	/* open did not return descriptor 0 */
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);		/* only reached if execve returns */
	}
	if (pid>0)
		/* parent: fork returned the child's pid; wait for that child */
		while (pid != wait(&i))
			/* nothing */;
	while (1) {
		/* the rc child is gone (or fork failed): keep a shell running */
		if ((pid=fork())<0) {
			printf("Fork failed in init\r\n");
			continue;
		}
		if (!pid) {
			/* child: fresh descriptors 0/1/2 on the tty, new session */
			close(0);close(1);close(2);
			setsid();
			(void) open("/dev/tty0",O_RDWR,0);
			(void) dup(0);
			(void) dup(0);
			_exit(execve("/bin/sh",argv,envp));
		}
		while (1)
			if (pid == wait(&i))
				break;
		printf("\n\rchild %d died with code %04x\n\r",pid,i);
		sync();
	}
	_exit(0);	/* NOTE! _exit, not exit() */
}

① 1号进程(init)打开标准输入、标准输出、标准错误三个句柄
② 创建子进程,子进程首先打开"/etc/rc"文件,然后执行shell
③ 该子进程退出后,init循环创建新的子进程运行shell;shell退出后会再次创建
④ 0号进程则在main()中循环执行pause(),等待被调度
####


2、普通进程的创建(WORKING)


众所周知每创建一个进程都会创建一个相对应的task_struct结构体,task结构体里就有代表该进程唯一的PID。


3、进程的调度与切换


这是sched.c文件的代码。
#include <linux/sched.h>#include <linux/kernel.h>#include <linux/sys.h>#include <linux/fdreg.h>#include <asm/system.h>#include <asm/io.h>#include <asm/segment.h>
#include <signal.h>
/* _S(nr): bitmap bit for signal number nr (signal 1 is bit 0). */
#define _S(nr) (1<<((nr)-1))
/* _BLOCKABLE: every signal bit except SIGKILL and SIGSTOP. */
#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP)))
/*
 * show_task: print one line of state for a task plus a measure of how
 * much space after its task_struct is still zero.
 *
 * nr - number printed in front of the line (show_stat passes the task
 *      slot index, which is not the pid)
 * p  - the task to describe
 */
void show_task(int nr,struct task_struct * p)
{
	int i,j = 4096-sizeof(struct task_struct);

	printk("%d: pid=%d, state=%d, ",nr,p->pid,p->state);
	i=0;
	/* count the zero bytes that directly follow the task_struct */
	while (i<j && !((char *)(p+1))[i])
		i++;
	printk("%d (of %d) chars free in kernel stack\n\r",i,j);
}
/* show_stat: call show_task() for every occupied slot of the task array. */
void show_stat(void)
{
	int slot = 0;

	while (slot < NR_TASKS) {
		if (task[slot])
			show_task(slot, task[slot]);
		slot++;
	}
}
#define LATCH (1193180/HZ)
extern void mem_use(void);
extern int timer_interrupt(void);extern int system_call(void);
/* A task_struct overlaid on a PAGE_SIZE byte array, so the two share one page. */
union task_union {
	struct task_struct task;
	char stack[PAGE_SIZE];
};

/* Statically initialised task; it also occupies slot 0 of task[] below. */
static union task_union init_task = {INIT_TASK,};

long volatile jiffies=0;
long startup_time=0;
struct task_struct *current = &(init_task.task);	/* the running task */
struct task_struct *last_task_used_math = NULL;	/* last task to use the math unit */

struct task_struct * task[NR_TASKS] = {&(init_task.task), };

long user_stack [ PAGE_SIZE>>2 ] ;
/* Pointer just past the end of user_stack, paired with the value 0x10. */
struct {
	long * a;
	short b;
	} stack_start = { & user_stack [PAGE_SIZE>>2] , 0x10 };
/*
 *  'math_state_restore()' saves the current math information in the
 * old math state array, and gets the new ones from the current task
 */
void math_state_restore()
{
	if (last_task_used_math == current)
		return;		/* the math state already belongs to current */
	__asm__("fwait");
	if (last_task_used_math) {
		__asm__("fnsave %0"::"m" (last_task_used_math->tss.i387));
	}
	last_task_used_math=current;
	if (current->used_math) {
		__asm__("frstor %0"::"m" (current->tss.i387));
	} else {
		/* first use by this task: initialise instead of restoring */
		__asm__("fninit"::);
		current->used_math=1;
	}
}
/*
 *  'schedule()' is the scheduler function. This is GOOD CODE! There
 * probably won't be any reason to change this, as it should work well
 * in all circumstances (ie gives IO-bound processes good response etc).
 * The one thing you might take a look at is the signal-handler code here.
 *
 *   NOTE!! Task 0 is the 'idle' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
 * information in task[0] is never used.
 */
void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

/*
 * Task states, as listed in the accompanying notes:
 *   TASK_RUNNING         0  running or ready to run
 *   TASK_INTERRUPTIBLE   1  sleeping; a signal can make it runnable
 *   TASK_UNINTERRUPTIBLE 2  sleeping; only an explicit wake-up makes it runnable
 *   TASK_ZOMBIE          3  stopped for good, task_struct not yet released
 *   TASK_STOPPED         4  stopped
 */
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)	/* walk from the last slot down */
		if (*p) {
			/* an alarm that is set and already in the past raises SIGALRM */
			if ((*p)->alarm && (*p)->alarm < jiffies) {
					(*p)->signal |= (1<<(SIGALRM-1));
					(*p)->alarm = 0;
				}
			/* a pending signal that is not blocked (SIGKILL and SIGSTOP
			 * never count as blocked) wakes an interruptible sleeper */
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		}

/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		/* find the runnable task with the largest counter; next is its
		 * slot index in task[], not its pid */
		while (--i) {
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		/* c != 0: either a task with time left was found, or nothing
		 * is runnable (c is still -1 and next is 0) */
		if (c) break;
		/* every runnable task has counter 0: recompute all counters
		 * as counter/2 + priority */
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	switch_to(next);	/* switch to the chosen task */
}
/*
 * Summary: among the runnable tasks, the slot with the largest counter is
 * selected. A nonzero maximum means that task is switched to; a maximum of
 * zero means every runnable task has used up its time slice, so all
 * counters are recomputed and the search is repeated.
 */

/*#define switch_to(n) {\struct {long a,b;} __tmp; \__asm__("cmpl %%ecx,_current\n\t" \ "je 1f\n\t" \ "movw %%dx,%1\n\t" \ "xchgl %%ecx,_current\n\t" \ "ljmp %0\n\t" \ "cmpl %%ecx,_last_task_used_math\n\t" \ "jne 1f\n\t" \ "clts\n" \ "1:" \ ::"m" (*&__tmp.a),"m" (*&__tmp.b), \ "d" (_TSS(n)),"c" ((long) task[n])); \}*/




/*
 * sys_pause: put the current task into TASK_INTERRUPTIBLE and call the
 * scheduler. Returns 0 once the task runs again.
 */
int sys_pause(void)
{
	current->state = TASK_INTERRUPTIBLE;
	schedule();

	return 0;
}
/*
 * sleep_on: put the current task to sleep (TASK_UNINTERRUPTIBLE) on the
 * wait pointer *p.
 *
 * The task previously stored in *p is remembered in tmp and *p is set to
 * the current task. After schedule() returns, the remembered task, if any,
 * is made runnable (state 0). A NULL p is ignored; the initial task
 * (init_task) is not allowed to sleep and causes a panic.
 */
void sleep_on(struct task_struct **p)
{
	struct task_struct *tmp;

	if (!p)
		return;
	if (current == &(init_task.task))
		panic("task[0] trying to sleep");
	tmp = *p;
	*p = current;	/* the wait pointer now refers to the current task */
	current->state = TASK_UNINTERRUPTIBLE;
	schedule();
	if (tmp)
		tmp->state=0;	/* pass the wake-up on to the earlier waiter */
}
/*
 * interruptible_sleep_on: like sleep_on(), but the task sleeps in
 * TASK_INTERRUPTIBLE.
 *
 * After each return from schedule(), if *p refers to a different task,
 * that task is made runnable and the current task goes back to sleep.
 * Once *p is the current task (or NULL), *p is cleared and the task that
 * was stored in *p on entry, if any, is made runnable.
 */
void interruptible_sleep_on(struct task_struct **p)
{
	struct task_struct *tmp;

	if (!p)
		return;
	if (current == &(init_task.task))
		panic("task[0] trying to sleep");
	tmp=*p;
	*p=current;
repeat:	current->state = TASK_INTERRUPTIBLE;
	schedule();
	if (*p && *p != current) {
		/* another task is recorded in *p: let it run first */
		(**p).state=0;
		goto repeat;
	}
	*p=NULL;
	if (tmp)
		tmp->state=0;
}
/*
 * wake_up: make the task stored in *p runnable (state 0) and clear *p.
 * Does nothing when p or *p is NULL.
 */
void wake_up(struct task_struct **p)
{
	if (!p || !*p)
		return;
	(**p).state = 0;
	*p = NULL;
}
/*
 * OK, here are some floppy things that shouldn't be in the kernel
 * proper. They are here because the floppy needs a timer, and this
 * was the easiest way of doing it.
 */
static struct task_struct * wait_motor[4] = {NULL,NULL,NULL,NULL};	/* per-drive wait pointer */
static int mon_timer[4]={0,0,0,0};	/* per-drive ticks left before wake_up() is called */
static int moff_timer[4]={0,0,0,0};	/* per-drive ticks left before the drive's bit is cleared */
unsigned char current_DOR = 0x0C;	/* last value written to FD_DOR */
/*
 * ticks_to_floppy_on: set drive nr's bit in the digital output register
 * and return the value of mon_timer[nr] (ticks still to wait).
 *
 * nr - drive number, 0..3 (anything larger panics)
 *
 * Interrupts are disabled while current_DOR is examined and updated and
 * are enabled again before returning.
 */
int ticks_to_floppy_on(unsigned int nr)
{
	extern unsigned char selected;
	unsigned char mask = 0x10 << nr;	/* bit for drive nr in the upper nibble */

	if (nr>3)
		panic("floppy_on: nr>3");
	moff_timer[nr]=10000;		/* 100 s = very big :-) */
	cli();				/* use floppy_off to turn it off */
	mask |= current_DOR;
	if (!selected) {
		/* replace the low two bits with the drive number */
		mask &= 0xFC;
		mask |= nr;
	}
	if (mask != current_DOR) {
		outb(mask,FD_DOR);
		/* an upper-nibble bit changed: wait HZ/2 ticks; otherwise
		 * make sure at least 2 ticks are waited */
		if ((mask ^ current_DOR) & 0xf0)
			mon_timer[nr] = HZ/2;
		else if (mon_timer[nr] < 2)
			mon_timer[nr] = 2;
		current_DOR = mask;
	}
	sti();
	return mon_timer[nr];
}
/*
 * floppy_on: sleep on wait_motor[nr] until ticks_to_floppy_on(nr)
 * returns 0.
 */
void floppy_on(unsigned int nr)
{
	cli();
	while (ticks_to_floppy_on(nr))
		sleep_on(nr+wait_motor);
	sti();
}
/* floppy_off: set drive nr's moff_timer to 3*HZ ticks. */
void floppy_off(unsigned int nr)
{
	moff_timer[nr] = 3*HZ;
}
/*
 * do_floppy_timer: per-tick bookkeeping for the four floppy drives.
 *
 * For every drive whose bit is set in current_DOR: while mon_timer is
 * running it is decremented and the waiter is woken when it reaches 0;
 * otherwise moff_timer is decremented, and once it is 0 the drive's bit
 * is cleared in current_DOR and the new value is written to FD_DOR.
 */
void do_floppy_timer(void)
{
	int i;
	unsigned char mask = 0x10;

	for (i=0 ; i<4 ; i++,mask <<= 1) {
		if (!(mask & current_DOR))
			continue;	/* this drive's bit is not set */
		if (mon_timer[i]) {
			if (!--mon_timer[i])
				wake_up(i+wait_motor);
		} else if (!moff_timer[i]) {
			current_DOR &= ~mask;
			outb(current_DOR,FD_DOR);
		} else
			moff_timer[i]--;
	}
}
#define TIME_REQUESTS 64
/*
 * Pool of TIME_REQUESTS timer entries. An entry with fn == NULL is free.
 * next_timer heads the list of pending entries.
 */
static struct timer_list {
	long jiffies;			/* ticks to wait */
	void (*fn)();			/* function to call on expiry */
	struct timer_list * next;
} timer_list[TIME_REQUESTS], * next_timer = NULL;
/*
 * add_timer: arrange for fn to be called after `jiffies` ticks.
 *
 * A NULL fn is ignored. With jiffies <= 0 the function is called at once.
 * Otherwise a free entry is taken from timer_list[] (panic if none is
 * left), put at the head of the pending list, and then moved towards the
 * tail by swapping contents with its successor while the successor's
 * count is smaller; each swap first subtracts the successor's count.
 * Runs with interrupts disabled and enables them before returning.
 */
void add_timer(long jiffies, void (*fn)(void))
{
	struct timer_list * p;

	if (!fn)
		return;
	cli();
	if (jiffies <= 0)
		(fn)();
	else {
		/* find a free entry */
		for (p = timer_list ; p < timer_list + TIME_REQUESTS ; p++)
			if (!p->fn)
				break;
		if (p >= timer_list + TIME_REQUESTS)
			panic("No more time requests free");
		p->fn = fn;
		p->jiffies = jiffies;
		p->next = next_timer;
		next_timer = p;
		/* swap the entry's contents down the list into position */
		while (p->next && p->next->jiffies < p->jiffies) {
			p->jiffies -= p->next->jiffies;
			fn = p->fn;
			p->fn = p->next->fn;
			p->next->fn = fn;
			jiffies = p->jiffies;
			p->jiffies = p->next->jiffies;
			p->next->jiffies = jiffies;
			p = p->next;
		}
	}
	sti();
}
/*
 * do_timer: work done on every timer tick.
 *
 * cpl - nonzero charges the tick to the task's utime, zero to its stime;
 *       schedule() is only called at the end when cpl is nonzero
 *
 * Also counts down the beep, runs expired entries of the pending timer
 * list, services the floppy timers while any upper-nibble bit of
 * current_DOR is set, and decrements the current task's counter.
 */
void do_timer(long cpl)
{
	extern int beepcount;
	extern void sysbeepstop(void);

	if (beepcount)
		if (!--beepcount)
			sysbeepstop();

	if (cpl)
		current->utime++;
	else
		current->stime++;

	if (next_timer) {
		next_timer->jiffies--;
		/* run every entry at the head whose count has reached zero */
		while (next_timer && next_timer->jiffies <= 0) {
			void (*fn)(void);

			fn = next_timer->fn;
			next_timer->fn = NULL;	/* marks the entry as free again */
			next_timer = next_timer->next;
			(fn)();
		}
	}
	if (current_DOR & 0xf0)
		do_floppy_timer();
	if ((--current->counter)>0) return;	/* time slice not used up yet */
	current->counter=0;
	if (!cpl) return;
	schedule();
}
/*
 * sys_alarm: set the current task's alarm to fire `seconds` seconds from
 * now; a value <= 0 clears it.
 *
 * Returns the time that was left on the previous alarm, in seconds, or 0
 * when no alarm was set.
 */
int sys_alarm(long seconds)
{
	int old = current->alarm;

	if (old)
		old = (old - jiffies) / HZ;

	if (seconds > 0)
		current->alarm = jiffies+HZ*seconds;
	else
		current->alarm = 0;

	return (old);
}
/* sys_getpid: return the current task's pid. */
int sys_getpid(void)
{
	return current->pid;
}

/* sys_getppid: return the pid stored in the current task's father field. */
int sys_getppid(void)
{
	return current->father;
}

/* sys_getuid: return the current task's uid. */
int sys_getuid(void)
{
	return current->uid;
}

/* sys_geteuid: return the current task's euid. */
int sys_geteuid(void)
{
	return current->euid;
}

/* sys_getgid: return the current task's gid. */
int sys_getgid(void)
{
	return current->gid;
}

/* sys_getegid: return the current task's egid. */
int sys_getegid(void)
{
	return current->egid;
}
/*
 * sys_nice: lower the current task's priority by `increment`, but only
 * when the result stays above zero. Always returns 0.
 */
int sys_nice(long increment)
{
	if (current->priority - increment <= 0)
		return 0;

	current->priority -= increment;
	return 0;
}
/*
 * sched_init: one-time scheduler set-up.
 *
 * Installs the TSS and LDT descriptors of init_task in the GDT, clears
 * task slots 1..NR_TASKS-1 together with two GDT entries per slot, loads
 * the task register and LDT register for task 0, programs timer channel 0
 * with LATCH, installs timer_interrupt at vector 0x20 and system_call at
 * vector 0x80, and clears bit 0 of the mask read from port 0x21.
 */
void sched_init(void)
{
	int i;
	struct desc_struct * p;

	if (sizeof(struct sigaction) != 16)
		panic("Struct sigaction MUST be 16 bytes");
	set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
	set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
	p = gdt+2+FIRST_TSS_ENTRY;	/* first descriptor after init_task's pair */
	for(i=1;i<NR_TASKS;i++) {
		task[i] = NULL;
		p->a=p->b=0;
		p++;
		p->a=p->b=0;
		p++;
	}
/* Clear NT, so that we won't have troubles with that later on */
	__asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
	ltr(0);
	lldt(0);
	outb_p(0x36,0x43);		/* binary, mode 3, LSB/MSB, ch 0 */
	outb_p(LATCH & 0xff , 0x40);	/* LSB */
	outb(LATCH >> 8 , 0x40);	/* MSB */
	set_intr_gate(0x20,&timer_interrupt);
	outb(inb_p(0x21)&~0x01,0x21);
	set_system_gate(0x80,&system_call);
}


4、进程的销毁


就是exit.c文件中各函数的操作。


5、进程间的通信(WORKING)


(1)进程,线程

创建一个进程之后,就会对应一个task_struct结构体,fork之后,会进行写时复制(Copy-On-Write),也就是说子进程和父进程的内容大部分是一致的。

问:一个进程多个线程的调度方式和一个进程一个线程时的调度方式有什么区别?

答:没有区别,内核中线程和进程都需要do_fork来实现,所以没有区别。





操作系统的引导与启动


1、BIOS/Bootloader:


由PC机的BIOS(0xFFFF0是BIOS存储的总线地址)把bootsect从磁盘的引导扇区加载到内存中的固定地址(0x7c00),并且进行了一系列的硬件初始化和参数设置;随后bootsect会把自己搬移到0x90000处继续运行。


2、bootsect.s(WORKING)


磁盘引导块程序,在磁盘的第一个扇区中的程序(0磁道,0磁头,1扇区)。

作用:首先将后续的setup.s代码从磁盘中加载到紧接着bootsect.s的地方,在显示屏上显示loading system ,再将操作系统加载到0x10000,最后转到setup.s运行。
! SYS_SIZE is the number of clicks (16 bytes) to be loaded.! 0x3000 is 0x30000 bytes = 196kB, more than enough for current! versions of linux!SYSSIZE = 0x3000!! bootsect.s (C) 1991 Linus Torvalds!! bootsect.s is loaded at 0x7c00 by the bios-startup routines, and moves! iself out of the way to address 0x90000, and jumps there.!! It then loads 'setup' directly after itself (0x90200), and the system! at 0x10000, using BIOS interrupts.!! NOTE! currently system is at most 8*65536 bytes long. This should be no! problem, even in the future. I want to keep it simple. This 512 kB! kernel size should be enough, especially as this doesn't contain the! buffer cache as in minix!! The loader has been made as simple as possible, and continuos! read errors will result in a unbreakable loop. Reboot by hand. It! loads pretty fast by getting whole sectors at a time whenever possible.
.globl begtext, begdata, begbss, endtext, enddata, endbss.textbegtext:.databegdata:.bssbegbss:.text
SETUPLEN = 4 ! nr of setup-sectorsBOOTSEG = 0x07c0 ! original address of boot-sectorINITSEG = 0x9000 ! we move boot here - out of the waySETUPSEG = 0x9020 ! setup starts hereSYSSEG = 0x1000 ! system loaded at 0x10000 (65536).ENDSEG = SYSSEG + SYSSIZE ! where to stop loading
! ROOT_DEV: 0x000 - same type of floppy as boot.! 0x301 - first partition on first drive etcROOT_DEV = 0x306
entry startstart:mov ax,#BOOTSEGmov ds,axmov ax,#INITSEGmov es,axmov cx,#256sub si,sisub di,direpmovwjmpi go,INITSEGgo: mov ax,csmov ds,axmov es,ax! put stack at 0x9ff00.mov ss,axmov sp,#0xFF00 ! arbitrary value >>512
! load the setup-sectors directly after the bootblock.! Note that 'es' is already set up.
load_setup:mov dx,#0x0000 ! drive 0, head 0mov cx,#0x0002 ! sector 2, track 0mov bx,#0x0200 ! address = 512, in INITSEGmov ax,#0x0200+SETUPLEN ! service 2, nr of sectorsint 0x13 ! read itjnc ok_load_setup ! ok - continuemov dx,#0x0000mov ax,#0x0000 ! reset the disketteint 0x13j load_setup
ok_load_setup:
! Get disk drive parameters, specifically nr of sectors/track
mov dl,#0x00mov ax,#0x0800 ! AH=8 is get drive parametersint 0x13mov ch,#0x00seg csmov sectors,cxmov ax,#INITSEGmov es,ax
! Print some inane message
mov ah,#0x03 ! read cursor posxor bh,bhint 0x10
mov cx,#24mov bx,#0x0007 ! page 0, attribute 7 (normal)mov bp,#msg1mov ax,#0x1301 ! write string, move cursorint 0x10
! ok, we've written the message, now! we want to load the system (at 0x10000)
mov ax,#SYSSEGmov es,ax ! segment of 0x010000call read_itcall kill_motor
! After that we check which root-device to use. If the device is! defined (!= 0), nothing is done and the given device is used.! Otherwise, either /dev/PS0 (2,28) or /dev/at0 (2,8), depending! on the number of sectors that the BIOS reports currently.
seg csmov ax,root_devcmp ax,#0jne root_definedseg csmov bx,sectorsmov ax,#0x0208 ! /dev/ps0 - 1.2Mbcmp bx,#15je root_definedmov ax,#0x021c ! /dev/PS0 - 1.44Mbcmp bx,#18je root_definedundef_root:jmp undef_rootroot_defined:seg csmov root_dev,ax
! after that (everyting loaded), we jump to! the setup-routine loaded directly after! the bootblock:
jmpi 0,SETUPSEG
! This routine loads the system at address 0x10000, making sure! no 64kB boundaries are crossed. We try to load it as fast as! possible, loading whole tracks whenever we can.!! in: es - starting address segment (normally 0x1000)!sread: .word 1+SETUPLEN ! sectors read of current trackhead: .word 0 ! current headtrack: .word 0 ! current track
read_it:mov ax,estest ax,#0x0fffdie: jne die ! es must be at 64kB boundaryxor bx,bx ! bx is starting address within segmentrp_read:mov ax,escmp ax,#ENDSEG ! have we loaded all yet?jb ok1_readretok1_read:seg csmov ax,sectorssub ax,sreadmov cx,axshl cx,#9add cx,bxjnc ok2_readje ok2_readxor ax,axsub ax,bxshr ax,#9ok2_read:call read_trackmov cx,axadd ax,sreadseg cscmp ax,sectorsjne ok3_readmov ax,#1sub ax,headjne ok4_readinc trackok4_read:mov head,axxor ax,axok3_read:mov sread,axshl cx,#9add bx,cxjnc rp_readmov ax,esadd ax,#0x1000mov es,axxor bx,bxjmp rp_read
read_track:push axpush bxpush cxpush dxmov dx,trackmov cx,sreadinc cxmov ch,dlmov dx,headmov dh,dlmov dl,#0and dx,#0x0100mov ah,#2int 0x13jc bad_rtpop dxpop cxpop bxpop axretbad_rt: mov ax,#0mov dx,#0int 0x13pop dxpop cxpop bxpop axjmp read_track
/** This procedure turns off the floppy drive motor, so* that we enter the kernel in a known state, and* don't have to worry about it later.*/kill_motor:push dxmov dx,#0x3f2mov al,#0outbpop dxret
sectors:.word 0
msg1:.byte 13,10.ascii "Loading system ...".byte 13,10,13,10
.org 508root_dev:.word ROOT_DEVboot_flag:.word 0xAA55
.textendtext:.dataenddata:.bssendbss:


3、setup.s(WORKING)


解析BIOS/Bootloader传进来的参数,设置系统内核运行的LDT(局部描述符),IDT(中断描述符) GDT(全局描述符),设置中断控制芯片,进入保护模式运行;跳转到head.s运行。
setup.s (C) 1991 Linus Torvalds!! setup.s is responsible for getting the system data from the BIOS,! and putting them into the appropriate places in system memory.! both setup.s and system has been loaded by the bootblock.!! This code asks the bios for memory/disk/other parameters, and! puts them in a "safe" place: 0x90000-0x901FF, ie where the! boot-block used to be. It is then up to the protected mode! system to read them from there before the area is overwritten! for buffer-blocks.!
! NOTE! These had better be the same as in bootsect.s!
INITSEG = 0x9000 ! we move boot here - out of the waySYSSEG = 0x1000 ! system loaded at 0x10000 (65536).SETUPSEG = 0x9020 ! this is the current segment
.globl begtext, begdata, begbss, endtext, enddata, endbss.textbegtext:.databegdata:.bssbegbss:.text
entry startstart:
! ok, the read went well so we get current cursor position and save it for! posterity.
mov ax,#INITSEG ! this is done in bootsect already, but...mov ds,axmov ah,#0x03 ! read cursor posxor bh,bhint 0x10 ! save it in known place, con_init fetchesmov [0],dx ! it from 0x90000.
! Get memory size (extended mem, kB)
mov ah,#0x88int 0x15mov [2],ax
! Get video-card data:
mov ah,#0x0fint 0x10mov [4],bx ! bh = display pagemov [6],ax ! al = video mode, ah = window width
! check for EGA/VGA and some config parameters
mov ah,#0x12mov bl,#0x10int 0x10mov [8],axmov [10],bxmov [12],cx
! Get hd0 data
mov ax,#0x0000mov ds,axlds si,[4*0x41]mov ax,#INITSEGmov es,axmov di,#0x0080mov cx,#0x10repmovsb
! Get hd1 data
mov ax,#0x0000mov ds,axlds si,[4*0x46]mov ax,#INITSEGmov es,axmov di,#0x0090mov cx,#0x10repmovsb
! Check that there IS a hd1 :-)
mov ax,#0x01500mov dl,#0x81int 0x13jc no_disk1cmp ah,#3je is_disk1no_disk1:mov ax,#INITSEGmov es,axmov di,#0x0090mov cx,#0x10mov ax,#0x00repstosbis_disk1:
! now we want to move to protected mode ...
cli ! no interrupts allowed !
! first we move the system to it's rightful place
mov ax,#0x0000cld ! 'direction'=0, movs moves forwarddo_move:mov es,ax ! destination segmentadd ax,#0x1000cmp ax,#0x9000jz end_movemov ds,ax ! source segmentsub di,disub si,simov cx,#0x8000repmovswjmp do_move
! then we load the segment descriptors
end_move:mov ax,#SETUPSEG ! right, forgot this at first. didn't work :-)mov ds,axlidt idt_48 ! load idt with 0,0lgdt gdt_48 ! load gdt with whatever appropriate
! that was painless, now we enable A20
call empty_8042mov al,#0xD1 ! command writeout #0x64,alcall empty_8042mov al,#0xDF ! A20 onout #0x60,alcall empty_8042
! well, that went ok, I hope. Now we have to reprogram the interrupts :-(! we put them right after the intel-reserved hardware interrupts, at! int 0x20-0x2F. There they won't mess up anything. Sadly IBM really! messed this up with the original PC, and they haven't been able to! rectify it afterwards. Thus the bios puts interrupts at 0x08-0x0f,! which is used for the internal hardware interrupts as well. We just! have to reprogram the 8259's, and it isn't fun.
mov al,#0x11 ! initialization sequenceout #0x20,al ! send it to 8259A-1.word 0x00eb,0x00eb ! jmp $+2, jmp $+2out #0xA0,al ! and to 8259A-2.word 0x00eb,0x00ebmov al,#0x20 ! start of hardware int's (0x20)out #0x21,al.word 0x00eb,0x00ebmov al,#0x28 ! start of hardware int's 2 (0x28)out #0xA1,al.word 0x00eb,0x00ebmov al,#0x04 ! 8259-1 is masterout #0x21,al.word 0x00eb,0x00ebmov al,#0x02 ! 8259-2 is slaveout #0xA1,al.word 0x00eb,0x00ebmov al,#0x01 ! 8086 mode for bothout #0x21,al.word 0x00eb,0x00ebout #0xA1,al.word 0x00eb,0x00ebmov al,#0xFF ! mask off all interrupts for nowout #0x21,al.word 0x00eb,0x00ebout #0xA1,al
! well, that certainly wasn't fun :-(. Hopefully it works, and we don't! need no steenking BIOS anyway (except for the initial loading :-).! The BIOS-routine wants lots of unnecessary data, and it's less! "interesting" anyway. This is how REAL programmers do it.!! Well, now's the time to actually move into protected mode. To make! things as simple as possible, we do no register set-up or anything,! we let the gnu-compiled 32-bit programs do that. We just jump to! absolute address 0x00000, in 32-bit protected mode.
mov ax,#0x0001 ! protected mode (PE) bitlmsw ax ! This is it!jmpi 0,8 ! jmp offset 0 of segment 8 (cs)
! This routine checks that the keyboard command queue is empty! No timeout is used - if this hangs there is something wrong with! the machine, and we probably couldn't proceed anyway.empty_8042:.word 0x00eb,0x00ebin al,#0x64 ! 8042 status porttest al,#2 ! is input buffer full?jnz empty_8042 ! yes - loopret
gdt:.word 0,0,0,0 ! dummy
.word 0x07FF ! 8Mb - limit=2047 (2048*4096=8Mb).word 0x0000 ! base address=0.word 0x9A00 ! code read/exec.word 0x00C0 ! granularity=4096, 386
.word 0x07FF ! 8Mb - limit=2047 (2048*4096=8Mb).word 0x0000 ! base address=0.word 0x9200 ! data read/write.word 0x00C0 ! granularity=4096, 386
idt_48:.word 0 ! idt limit=0.word 0,0 ! idt base=0L
gdt_48:.word 0x800 ! gdt limit=2048, 256 GDT entries.word 512+gdt,0x9 ! gdt base = 0X9xxxx
.textendtext:.dataenddata:.bssendbss:


注:GDT,LDT,IDT表是什么?


GDT(global descriptor table),全局段描述符表。表中的每一项是一个64bit(8字节)的段描述符,这些描述符整齐地排列在内存中某一位置,而该位置的内存地址以及表的界限(有效长度)就存放在GDTR中,GDTR是一个特殊的寄存器。GDT在系统内只存在一个。

LDT(local descriptor table),局部段描述符表,LDT在系统内可存在多个,每个任务最多只能拥有一个LDT。另外,每一个LDT自身作为一个段存在,它们的段描述符被放在GDT中。

IDT(interrupt descriptor table),中断描述符表,IDT记录了0~255的中断号码和中断服务函数的关系。当发生中断的时候,通过中断号码去执行中断服务函数。

GDT可以被放在内存的任何位置,那么当程序员通过段寄存器来引用一个段描述符时,CPU必须知道GDT的入口,也就是基地址放在哪里,所以Intel的设计者们提供了一个寄存器GDTR用来存放GDT的入口地址。程序员将GDT设定在内存中某个位置之后,可以通过LGDT指令将GDT的入口地址装入此寄存器,从此以后,CPU就以此寄存器中的内容作为GDT的入口来访问GDT了。

IA-32为LDT的入口地址也提供了一个寄存器LDTR,因为在任何时刻只能有一个任务在运行,所以LDT寄存器全局也只需要有一个。如果一个任务拥有自身的LDT,那么当它需要引用自身的LDT时,它需要通过LLDT指令将其LDT的段描述符装入此寄存器。LLDT指令与LGDT指令不同的是:LGDT指令的操作数是一个32-bit的内存地址,这个内存地址处依次存放着16-bit的GDT Limit和32-bit的GDT入口地址(例如上面head.s中的gdt_descr:先是.word界限,后是.long _gdt);而LLDT指令的操作数是一个16-bit的选择子,这个选择子的主要内容是:被装入的LDT的段描述符在GDT中的索引值。


4、head.s(WORKING)


加载内核运行时的各数据段寄存器,重新设置中断描述符表,开启内核正常运行时的协处理器等资源;设置内存管理的分页机制,跳转到main.c运行。
/*
 *  head.s contains the 32-bit startup code.
 *
 * NOTE!!! Startup happens at absolute address 0x00000000, which is also where
 * the page directory will exist. The startup code will be overwritten by
 * the page directory.
 */
.text
.globl _idt,_gdt,_pg_dir,_tmp_floppy_area
_pg_dir:
startup_32:
	movl $0x10,%eax		# load selector 0x10 into every data segment register
	mov %ax,%ds
	mov %ax,%es
	mov %ax,%fs
	mov %ax,%gs
	lss _stack_start,%esp	# load ss:esp from the _stack_start structure
				# (defined outside this file; per the article,
				# ss=0x10 and esp=&user_stack[PAGE_SIZE>>2])
	call setup_idt		# build and load the idt ...
	call setup_gdt		# ... and the gdt
	movl $0x10,%eax		# reload all the segment registers
	mov %ax,%ds		# after changing gdt. CS was already
	mov %ax,%es		# reloaded in 'setup_gdt'
	mov %ax,%fs
	mov %ax,%gs
	lss _stack_start,%esp
	xorl %eax,%eax
1:	incl %eax		# check that A20 really IS enabled
	movl %eax,0x000000	# loop forever if it isn't
	cmpl %eax,0x100000
	je 1b
/*
 * NOTE! 486 should set bit 16, to check for write-protect in supervisor
 * mode. Then it would be unnecessary with the "verify_area()"-calls.
 * 486 users probably want to set the NE (#5) bit also, so as to use
 * int 16 for math errors.
 */
	movl %cr0,%eax		# check math chip
	andl $0x80000011,%eax	# Save PG,PE,ET
/* "orl $0x10020,%eax" here for 486 might be good */
	orl $2,%eax		# set MP
	movl %eax,%cr0
	call check_x87
	jmp after_page_tables
/*
 * We depend on ET to be correct. This checks for 287/387.
 *
 * After fninit the status word is read into %ax; a zero low byte takes
 * the branch to 1: (coprocessor path), anything else falls through and
 * flips the MP/EM bits in cr0.
 */
check_x87:
	fninit
	fstsw %ax
	cmpb $0,%al
	je 1f			/* no coprocessor: have to set bits */
	movl %cr0,%eax
	xorl $6,%eax		/* reset MP, set EM */
	movl %eax,%cr0
	ret
.align 2
1:	.byte 0xDB,0xE4		/* fsetpm for 287, ignored by 387 */
	ret
/*
 *  setup_idt
 *
 *  sets up a idt with 256 entries pointing to
 *  ignore_int, interrupt gates. It then loads
 *  idt. Everything that wants to install itself
 *  in the idt-table may do so themselves. Interrupts
 *  are enabled elsewhere, when we can be relatively
 *  sure everything is ok. This routine will be over-
 *  written by the page tables.
 *
 *  Each 8-byte entry is built in eax (low dword) and edx (high dword):
 *  eax = selector 0x0008 in the high 16 bits, low 16 bits of the
 *        ignore_int address in the low 16 bits;
 *  edx = high 16 bits of the ignore_int address, 0x8E00 in the low 16 bits.
 */
setup_idt:
	lea ignore_int,%edx	/* edx = address of ignore_int */
	movl $0x00080000,%eax	/* selector 0x0008 into the high word of eax */
	movw %dx,%ax		/* selector = 0x0008 = cs; low word = offset 15..0 */
	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */

	lea _idt,%edi		/* edi walks the table */
	mov $256,%ecx		/* 256 entries to fill */
rp_sidt:
	movl %eax,(%edi)
	movl %edx,4(%edi)
	addl $8,%edi
	dec %ecx
	jne rp_sidt
	lidt idt_descr
	ret
/*
 *  setup_gdt
 *
 *  This routines sets up a new gdt and loads it.
 *  Only two entries are currently built, the same
 *  ones that were built in init.s. The routine
 *  is VERY complicated at two whole lines, so this
 *  rather long comment is certainly needed :-).
 *  This routine will be overwritten by the page tables.
 */
setup_gdt:
	lgdt gdt_descr
	ret
/*
 * I put the kernel page tables right after the page directory,
 * using 4 of them to span 16 Mb of physical memory. People with
 * more than 16MB will have to expand this.
 *
 * The labels below only reserve the addresses; the tables themselves
 * are filled in at run time by setup_paging.
 */
.org 0x1000
pg0:

.org 0x2000
pg1:

.org 0x3000
pg2:

.org 0x4000
pg3:

.org 0x5000
/*
 * tmp_floppy_area is used by the floppy-driver when DMA cannot
 * reach to a buffer-block. It needs to be aligned, so that it isn't
 * on a 64kB border.
 */
_tmp_floppy_area:
	.fill 1024,1,0
/*
 * Build the stack frame that main() will see, then jump to setup_paging.
 * Because $_main is the last value pushed, the 'ret' at the end of
 * setup_paging transfers control to main; L6 sits where main's own
 * return address would be.
 */
after_page_tables:
	pushl $0		# These are the parameters to main :-)
	pushl $0
	pushl $0
	pushl $L6		# return address for main, if it decides to.
	pushl $_main
	jmp setup_paging
L6:
	jmp L6			# main should never return here, but
				# just in case, we know what happens.
/* This is the default interrupt "handler" :-) */
int_msg:
	.asciz "Unknown interrupt\n\r"
.align 2
/*
 * ignore_int saves eax/ecx/edx and ds/es/fs, switches the data segment
 * registers to selector 0x10, prints int_msg through _printk, then
 * restores everything in reverse order and returns with iret.
 */
ignore_int:
	pushl %eax
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs
	movl $0x10,%eax
	mov %ax,%ds
	mov %ax,%es
	mov %ax,%fs
	pushl $int_msg		/* argument for _printk */
	call _printk
	popl %eax		/* drop the _printk argument */
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax
	iret

/*
 * Setup_paging
 *
 * This routine sets up paging by setting the page bit
 * in cr0. The page tables are set up, identity-mapping
 * the first 16MB. The pager assumes that no illegal
 * addresses are produced (ie >4Mb on a 4Mb machine).
 *
 * NOTE! Although all physical memory should be identity
 * mapped by this routine, only the kernel page functions
 * use the >1Mb addresses directly. All "normal" functions
 * use just the lower 1Mb, or the local data space, which
 * will be mapped to some other place - mm keeps track of
 * that.
 *
 * For those with more memory than 16 Mb - tough luck. I've
 * not got it, why should you :-) The source is here. Change
 * it. (Seriously - it shouldn't be too difficult. Mostly
 * change some constants etc. I left it at 16Mb, as my machine
 * even cannot be extended past that (ok, but it was cheap :-)
 * I've tried to show which constants to change by having
 * some kind of marker at them (search for "16Mb"), but I
 * won't guarantee that's all :-( )
 */
.align 2
setup_paging:
	movl $1024*5,%ecx		/* 5 pages - pg_dir+4 page tables */
	xorl %eax,%eax
	xorl %edi,%edi			/* pg_dir is at 0x000 */
	cld;rep;stosl			/* zero all five pages */
	movl $pg0+7,_pg_dir		/* set present bit/user r/w */
	movl $pg1+7,_pg_dir+4		/*  --------- " " --------- */
	movl $pg2+7,_pg_dir+8		/*  --------- " " --------- */
	movl $pg3+7,_pg_dir+12		/*  --------- " " --------- */
	movl $pg3+4092,%edi		/* last entry of the last page table */
	movl $0xfff007,%eax		/*  16Mb - 4096 + 7 (r/w user,p) */
	std
1:	stosl			/* fill pages backwards - more efficient :-) */
	subl $0x1000,%eax
	jge 1b
	xorl %eax,%eax		/* pg_dir is at 0x0000 */
	movl %eax,%cr3		/* cr3 - page directory start */
	movl %cr0,%eax
	orl $0x80000000,%eax
	movl %eax,%cr0		/* set paging (PG) bit */
	ret			/* this also flushes prefetch-queue */
/*
 * Operands for lidt/lgdt: a 16-bit limit followed by a 32-bit base.
 * The extra ".word 0" in front of each keeps the .long base 4-byte aligned.
 */
.align 2
.word 0
idt_descr:
	.word 256*8-1		# idt contains 256 entries
	.long _idt
.align 2
.word 0
gdt_descr:
	.word 256*8-1		# so does gdt (not that that's any
	.long _gdt		# magic number, but it works for me :^)

	.align 3
_idt:	.fill 256,8,0		# idt is uninitialized

_gdt:	.quad 0x0000000000000000	/* NULL descriptor */
	.quad 0x00c09a0000000fff	/* 16Mb */
	.quad 0x00c0920000000fff	/* 16Mb */
	.quad 0x0000000000000000	/* TEMPORARY - don't use */
	.fill 252,8,0


5、main.c(WORKING)

void main(void) /* This really IS void, no error here. */{ /* The startup routine assumes (well, ...) this *//* * Interrupts are still disabled. Do necessary setups, then * enable them */ ROOT_DEV = ORIG_ROOT_DEV; drive_info = DRIVE_INFO; memory_end = (1<<20) + (EXT_MEM_K<<10); memory_end &= 0xfffff000;if (memory_end > 16*1024*1024) memory_end = 16*1024*1024;if (memory_end > 12*1024*1024) buffer_memory_end = 4*1024*1024;else if (memory_end > 6*1024*1024) buffer_memory_end = 2*1024*1024;else buffer_memory_end = 1*1024*1024; main_memory_start = buffer_memory_end;#ifdef RAMDISK main_memory_start += rd_init(main_memory_start, RAMDISK*1024);#endif mem_init(main_memory_start,memory_end); trap_init(); blk_dev_init(); chr_dev_init(); tty_init(); time_init(); sched_init(); buffer_init(buffer_memory_end); hd_init(); floppy_init(); sti(); move_to_user_mode();if (!fork()) { /* we count on this going ok */ init(); //init函数在三.1有分析 }/* * NOTE!! For any other task 'pause()' would mean we have to get a * signal to awaken, but task0 is the sole exception (see 'schedule()') * as task 0 gets activated at every idle moment (when no other tasks * can run). For task0 'pause()' just means we go check if some other * task can run, and if not we return here. */for(;;) pause();}





信号概述


信号(signal)是内核中很重要的机制(注意不要与"信号量"semaphore混淆)。关于信号的定义在/include/signal.h文件内,比如运行一个elf文件可能会出现段错误(SIGSEGV),玩pwn的同学应该很熟悉。在system_call.s中存在call do_signal,而do_signal在/kernel/signal.c内定义。

硬件来源:信号由硬件驱动产生
软件来源:系统提供了些API,例如kill命令
当进程收到信号时,会有三种处理方式:
忽略:忽略信号
执行默认操作:执行每个信号所对应的默认操作
执行自定义操作:执行用户自定义的信号处理函数
① 在系统中什么是信号,都有什么信号?
② 在系统接收到信号后,是如何进行处理的?
③ 信号作用。


1、signal.h

#ifndef _SIGNAL_H
#define _SIGNAL_H

#include <sys/types.h>

typedef int sig_atomic_t;
typedef unsigned int sigset_t;		/* 32 bits */

#define _NSIG		32
#define NSIG		_NSIG

/* Signal numbers */
#define SIGHUP		 1	/* hang-up of controlling terminal or process */
#define SIGINT		 2	/* keyboard interrupt */
#define SIGQUIT		 3	/* keyboard quit */
#define SIGILL		 4	/* illegal instruction */
#define SIGTRAP		 5	/* trace / breakpoint */
#define SIGABRT		 6	/* abnormal termination */
#define SIGIOT		 6	/* abnormal termination (same number as SIGABRT) */
#define SIGUNUSED	 7	/* unused */
#define SIGFPE		 8	/* coprocessor error */
#define SIGKILL		 9	/* terminate the process */
#define SIGUSR1		10	/* user signal 1 */
#define SIGSEGV		11	/* segmentation fault */
#define SIGUSR2		12	/* user signal 2 */
#define SIGPIPE		13	/* write to a pipe whose read ends are all closed */
#define SIGALRM		14	/* timer alarm */
#define SIGTERM		15	/* process termination */
#define SIGSTKFLT	16	/* stack fault */
#define SIGCHLD		17	/* child process changed state */
#define SIGCONT		18	/* resume a stopped process */
#define SIGSTOP		19	/* stop the process */
#define SIGTSTP		20	/* stop request from the tty */
#define SIGTTIN		21	/* background process requests input */
#define SIGTTOU		22	/* background process requests output */

/* Ok, I haven't implemented sigactions, but trying to keep headers POSIX */
#define SA_NOCLDSTOP	1		/* ignore SIGCHLD when the child is merely stopped */
#define SA_NOMASK	0x40000000	/* do not block the signal while its own handler runs */
#define SA_ONESHOT	0x80000000	/* reset to the default handler once the handler has run */

#define SIG_BLOCK	0	/* for blocking signals */
#define SIG_UNBLOCK	1	/* for unblocking signals */
#define SIG_SETMASK	2	/* for setting the signal mask */

#define SIG_DFL		((void (*)(int))0)	/* default signal handling */
#define SIG_IGN		((void (*)(int))1)	/* ignore signal */

/* Per-signal disposition record. */
struct sigaction {
	void (*sa_handler)(int);	/* action for the signal; may be SIG_DFL or SIG_IGN */
	sigset_t sa_mask;		/* signals to block while the handler runs */
	int sa_flags;			/* SA_* flags above */
	void (*sa_restorer)(void);	/* restorer function pointer */
};

void (*signal(int _sig, void (*_func)(int)))(int);
int raise(int sig);
int kill(pid_t pid, int sig);
int sigaddset(sigset_t *mask, int signo);
int sigdelset(sigset_t *mask, int signo);
int sigemptyset(sigset_t *mask);
int sigfillset(sigset_t *mask);
int sigismember(sigset_t *mask, int signo); /* 1 - is, 0 - not, -1 error */
int sigpending(sigset_t *set);
int sigprocmask(int how, sigset_t *set, sigset_t *oldset);
int sigsuspend(sigset_t *sigmask);
int sigaction(int sig, struct sigaction *act, struct sigaction *oldact);

#endif /* _SIGNAL_H */


2、signal.c

#include <linux/sched.h>#include <linux/kernel.h>#include <asm/segment.h>
#include <signal.h>
volatile void do_exit(int error_code);
/*
 * sys_sgetmask - report the blocked-signal mask of the current task.
 *
 * Returns current->blocked unchanged.
 */
int sys_sgetmask()
{
	int mask = current->blocked;

	return mask;
}
/*
 * sys_ssetmask - replace the blocked-signal mask of the current task.
 *
 * The bit for SIGKILL is always cleared from the new mask, so SIGKILL
 * can never be blocked through this call.
 *
 * Returns the mask that was in effect before the call.
 */
int sys_ssetmask(int newmask)
{
	int previous = current->blocked;
	int sigkill_bit = 1 << (SIGKILL - 1);

	current->blocked = newmask & ~sigkill_bit;
	return previous;
}
/*
 * save_old - copy one struct sigaction, byte by byte, from 'from' to 'to'.
 *
 * 'to' is first checked with verify_area() and then written through
 * put_fs_byte(); 'from' is read with an ordinary dereference.
 */
static inline void save_old(char * from,char * to)
{
	char *end = from + sizeof(struct sigaction);

	verify_area(to, sizeof(struct sigaction));
	for (; from < end; from++, to++)
		put_fs_byte(*from, to);
}
/*
 * get_new - copy one struct sigaction, byte by byte, from 'from' to 'to'.
 *
 * 'from' is read through get_fs_byte(); 'to' is written with an
 * ordinary store.
 */
static inline void get_new(char * from,char * to)
{
	unsigned int remaining = sizeof(struct sigaction);

	while (remaining--) {
		*to = get_fs_byte(from);
		from++;
		to++;
	}
}
int sys_signal(int signum, long handler, long restorer) //signum是信号标号,handlers是信号处理的函数指针,restorer是恢复函数指针,即执行完signal系统调用后,恢复堆栈及返回值{struct sigaction tmp;
if (signum<1 || signum>32 || signum==SIGKILL)return -1; tmp.sa_handler = (void (*)(int)) handler; //设置结构体 tmp.sa_mask = 0; tmp.sa_flags = SA_ONESHOT | SA_NOMASK; tmp.sa_restorer = (void (*)(void)) restorer; handler = (long) current->sigaction[signum-1].sa_handler; current->sigaction[signum-1] = tmp;return handler;}
int sys_sigaction(int signum, const struct sigaction * action, struct sigaction * oldaction) //设置新信号处理结构体{struct sigaction tmp;
if (signum<1 || signum>32 || signum==SIGKILL) //若不符合信号值大小,直接返回return -1; tmp = current->sigaction[signum-1]; //信号值所对应的sigaction结构体 get_new((char *) action, (char *) (signum-1+current->sigaction)); //设置新信号处理结构体if (oldaction) save_old((char *) &tmp,(char *) oldaction); //将old保存到tmpif (current->sigaction[signum-1].sa_flags & SA_NOMASK) //如果允许处理信号过程中再次收到该信号,则屏蔽码置为0 current->sigaction[signum-1].sa_mask = 0;else //否则,设置屏蔽本信号 current->sigaction[signum-1].sa_mask |= (1<<(signum-1));return 0;}
void do_signal(long signr,long eax, long ebx, long ecx, long edx,long fs, long es, long ds,long eip, long cs, long eflags,unsigned long * esp, long ss) //signr是信号值,其余都是当前寄存器为参数{unsigned long sa_handler; long old_eip=eip; //将用户态ip保存至old_eipstruct sigaction * sa = current->sigaction + signr - 1; //取出当前任务signr信号量所对应的sigaction结构体存入saint longs;unsigned long * tmp_esp;
sa_handler = (unsigned long) sa->sa_handler; //取出信号处理函数指针if (sa_handler==1) //若sa_handler是SIG_IGN,直接返回return;if (!sa_handler) { ///如果信号处理函数是 SIG_DFL,表示按默认方式处理if (signr==SIGCHLD) //不作处理,直接返回return;else do_exit(1<<(signr-1)); //否则终止进程,故默认处理方式一般效果是终止进程 }if (sa->sa_flags & SA_ONESHOT) //如果只需调用一次信号处理,则将sa_handler置零 sa->sa_handler = NULL; *(&eip) = sa_handler; //将用户返回地址换成信号处理函数 longs = (sa->sa_flags & SA_NOMASK)?7:8; //如果允许处理信号过程中再次收到该信号,longs 为 7,否则为 8 *(&esp) -= longs; //将用户栈腾出空间存放寄存器 verify_area(esp,longs*4); tmp_esp=esp; //保存腾出空间之后的esp put_fs_long((long) sa->sa_restorer,tmp_esp++); //存入恢复栈函数地址 put_fs_long(signr,tmp_esp++); //if (!(sa->sa_flags & SA_NOMASK)) put_fs_long(current->blocked,tmp_esp++); put_fs_long(eax,tmp_esp++); //下面的操作是将各种寄存器压入用户栈 put_fs_long(ecx,tmp_esp++); put_fs_long(edx,tmp_esp++); put_fs_long(eflags,tmp_esp++); put_fs_long(old_eip,tmp_esp++); current->blocked |= sa->sa_mask; //}


3、sa_restorer

/*
 * Restorer used when no blocked mask was pushed on the stack.
 * Pops, in order: signr (discarded), eax, ecx, edx, eflags, then
 * returns through the next value on the stack.
 */
sig_restore:
	addl $4,%esp		/* discard signr */
	popl %eax		/* restore eax (system call return value) */
	popl %ecx		/* restore ecx, edx */
	popl %edx
	popfl			/* restore eflags */
	ret
/*
 * Restorer used when a blocked mask was pushed on the stack.
 * Same as the mask-less variant, except that the saved mask is first
 * passed to ssetmask and then dropped.
 */
masksig_restore:
	addl $4,%esp		/* discard signr */
	call ssetmask		/* set the signal mask from the value now on top of the stack */
	addl $4,%esp		/* discard the mask */
	popl %eax
	popl %ecx
	popl %edx
	popfl
	ret





文件系统


顾名思义就是文件所组成的一个系统,linux下所谓“一切皆文件”,所以文件系统在内核中占了很大比重。

Linux启动过程:
① PCB上电后先由uboot初始化板子,然后将linux内核迁移到内存中运行;
② 由linux内核进行初始化操作,挂载第一个应用程序即根文件系统(linuxrc);
③ 根文件系统提供磁盘管理服务(glibc,设备节点,配置文件,应用程序 shell命令)。

1、文件系统概述


文件系统主要包括四个部分:高速缓冲区管理,文件底层操作,文件数据访问,文件高层访问控制。


(1)文件系统底层函数


① bitmap.c

程序包括对i节点位图和逻辑块位图进行释放和占用处理函数。操作i节点位图的函数是free_inode()和new_inode(),操作逻辑块位图的函数是free_block()和new_block()。


② truncate.c

程序包括对数据文件长度截断为0的函数truncate(),他将i节点指定的设备上文件长度截为0,并释放文件数据占用的设备逻辑块。


③ inode.c

程序包括取得(分配)i节点的函数iget()、放回(释放)内存i节点的函数iput(),以及根据i节点信息取得文件数据块在设备上对应逻辑块号的函数bmap()。


④ namei.c

程序主要包括函数namei(),该函数使用iget(),iput(),bmap()将给定的文件路径名映射到其i节点。


⑤ super.c

程序专门用于处理文件系统超级块,包括函数get_super()、put_super()和free_super()等,还包括几个文件系统加载/卸载处理函数和系统调用,如sys_mount()等。


(2)文件中数据的访问操作


① block_dev.c

程序中的函数block_read()和block_write()是用于读写块设备特殊文件的数据,所使用的参数指定要访问的设备号,起始地址和长度


② file_dev.c

程序中的file_read()和file_write()函数是用于访问一般的文件,所使用的参数指定文件对应的i节点和文件结构。


③ pipe.c

文件中实现了管道读写函数read_pipe()和write_pipe(),另外还实现了创建无名管道的系统调用pipe()。


④ char_dev.c

系统调用使用read()和write()会调用char_dev.c中的rw_char()函数来操作。字符设备包括控制台终端,串口终端和内存字符设备。


(3)文件和目录管理系统调用


① open.c

文件用于实现与文件操作相关的系统调用,主要有文件的创建,打开和关闭,文件宿主和属性修改,文件访问权限和操作时间的修改等。


② exec.c

程序实现对二进制可执行文件和shell脚本文件的加载与执行,其中最主要的是do_execve(),它是系统调用中断(int 0x80)中功能号__NR_execve所对应的C处理函数,也是exec()函数簇的主要实现函数。


③ fcntl.c

实现了文件控制操作的系统调用fcntl()和两个文件句柄(描述符)复制系统调用dup()和dup2(),dup2()指定了新句柄的数值,dup()则返回当前最小值的未用句柄。句柄复制操作主要用在文件的标准输入/输出重定向和管道操作方面。


④ ioctl.c

文件实现了输入/输出控制系统调用ioctl(),主要调用tty_ioctl()函数,对终端的I/O进行控制。


⑤ stat.c

文件用于实现取得文件状态信息的系统调用,stat()和fstat()。stat()是利用文件名取信息,而fstat()是利用文件句柄取信息。


2、高速缓冲区管理(buffer.c)


高速缓冲区位于内核代码与主内存区之间,在块设备与内核其他程序之间起着一个桥梁作用,除了块设备驱动程序以外,内核程序如果需要访问块设备中的数据,就需要通过高速缓冲区来进行操作。

END

官方站点:www.linuxprobe.com

Linux命令大全:www.linuxcool.com

刘遄老师QQ:5604241

Linux技术交流群:3762708

(新群,火热加群中……)

想要学习Linux系统的读者可以点击"阅读原文"按钮来了解书籍《Linux就该这么学》,同时也非常适合专业的运维人员阅读,成为辅助您工作的高价值工具书!


微信扫码关注该文公众号作者

戳这里提交新闻线索和高质量文章给我们。
相关阅读
华为开发者贡献 Linux 内核补丁,将核心内核函数速度提升 715 倍学习笔记 | 二十届二中全会,新提法新部署新要求活着时 为自己准备了后事清平乐 - 一张白纸《共产党下台》谁准备好了上台封在养老院里的老爹(学习笔记)大兴调查研究工作方案全新打开方式!两会学习笔记→Linux 内核 6.1 发布,包含初始 Rust 支持 | Linux 中国学习笔记:有关“斗争”美国护士学习笔记火遍全网,卖笔记怒赚200万美元?!Linux 只是一个内核:这是什么意思? | Linux 中国学习笔记:述职报告撰写体会及其他2023政府工作报告要点:学习笔记(划重点)学习笔记:中国式现代化建设你现在可以在 Arch Linux 上安装 Unity 7.6 桌面了 | Linux 中国如何在 Arch Linux 中安装 Cinnamon 桌面 | Linux 中国电子数据取证学习笔记在 Mac 上运行 Linux 更进一步,Apple SoC CPUFreq 驱动即将并入 Linux 主线内核Linux 内核概念和学习路线天赋“易昺(bǐng)”,创造历史!(学习笔记)国务院机构组成及部门排序学习笔记:政治局会议和两会柿话柿说,柿柿留心(12)柿过竞迁、事过境迁美国急诊室护士学习笔记火遍全网,开网店卖笔记怒赚200万美元?!硬核观察 #848 Linux 6.1 发布,拉开 Rust 进入 Linux 内核的大幕Arch Linux 2023.01.01 版本 ISO 镜像发布:采用 Linux 内核 6.1世界上只有两个 Linux 发行版:Arch Linux 与其它 | Linux 中国Linux 6.1 内核被批准为长期支持版本 | Linux 中国Bodhi Linux 7.0.0 开始测试新的功能和软件包 | Linux 中国《深度合成管理规定》学习笔记(附逐条对比)干货!Linux 防火墙配置 ( iptables 和 firewalld )如何在 Arch Linux 中安装 OpenOffice(新手指南) | Linux 中国( 学习笔记 ) 中央经济工作会议精神急诊室护士学习笔记火遍全网,开网店卖笔记怒赚200万美元?!
logo
联系我们隐私协议©2024 redian.news
Redian新闻
Redian.news刊载任何文章,不代表同意其说法或描述,仅为提供更多信息,也不构成任何建议。文章信息的合法性及真实性由其作者负责,与Redian.news及其运营公司无关。欢迎投稿,如发现稿件侵权,或作者不愿在本网发表文章,请版权拥有者通知本网处理。