分析Linux內核創建一個新進程的過程

曹朋輝
原創作品轉載請註明出處
《Linux內核分析》MOOC課程

內核裡操作系統的三大功能:
內存管理
進程管理
文件系統
其中最核心的是進程管理

進程描述符task_struct數據結構

進程控制塊PCB——task_struct
為了管理進程,內核必須對每個進程進行清晰的描述,進程描述符提供了內核所需了解的進程信息。
struct task_struct數據結構很龐大
Linux進程的狀態與操作系統原理中描述的進程狀態似乎有所不同,比如就緒狀態和運行狀態都是TASK_RUNNING,為什麼呢?
進程的標識pid
所有進程鏈表struct list_head tasks;
內核的雙向循環鏈表的實現方法 - 一個更簡略的雙向循環鏈表
程序創建的進程具有父子關係,在編程時往往需要引用這樣的父子關係。進程描述符中有幾個域用來表示這樣的關係。
Linux為每個進程分配一個8KB大小的內存區域,用于存放該進程兩個不同的數據結構:thread_info和進程的內核堆棧
進程處于內核態時使用,不同于用戶態堆棧,即PCB中指定了內核棧,那為什麼PCB中沒有用戶態堆棧?用戶態堆棧是怎麼設定的?
內核控制路徑所用的堆棧很少,因此對棧和thread_info來說,8KB足夠了
struct thread_struct thread; //CPU-specific state of this task
文件系統和文件描述符
內存管理——進程的地址空間

task_struct數據結構(源碼第1235-1644行)
task_struct數據結構總覽
Linux進程狀態轉換圖
struct task_struct {
1236    volatile long state;    運行狀態/* -1 unrunnable, 0 runnable, >0 stopped */
1237    void *stack;  進程的內(nèi)核堆棧
1238    atomic_t usage;   
1239    unsigned int flags; /* per process flags, defined below */
1240    unsigned int ptrace;
#ifdef CONFIG_SMP  多處理器時會用到
1243    struct llist_node wake_entry;
1244    int on_cpu;
1245    struct task_struct *last_wakee;
1246    unsigned long wakee_flips;
1247    unsigned long wakee_flip_decay_ts;
1248
1249    int wake_cpu;
1250#endif
//下面一段和優先級,調度相關
1251        int on_rq;
1252
1253    int prio, static_prio, normal_prio;
1254    unsigned int rt_priority;
1255    const struct sched_class *sched_class;
1256    struct sched_entity se;
1257    struct sched_rt_entity rt;
1258#ifdef CONFIG_CGROUP_SCHED
1259    struct task_group *sched_task_group;
1260#endif
1261    struct sched_dl_entity dl;


1295        struct list_head tasks;  進程鏈表 
1296#ifdef CONFIG_SMP
1297    struct plist_node pushable_tasks;
1298    struct rb_node pushable_dl_tasks;
1299#endif
1300
1301    struct mm_struct *mm, *active_mm;  內存管理進程的地址空間相關
1302#ifdef CONFIG_COMPAT_BRK
1303    unsigned brk_randomized:1;
1304#endif
1305    /* per-thread vma caching */
1306    u32 vmacache_seqnum;
1307    struct vm_area_struct *vmacache[VMACACHE_SIZE];
1308#if defined(SPLIT_RSS_COUNTING)
1309    struct task_rss_stat    rss_stat;
1310#endif


/* Revert to default priority/policy when forking */
1325    unsigned sched_reset_on_fork:1;
1326    unsigned sched_contributes_to_load:1;
1327
1328    unsigned long atomic_flags; /* Flags needing atomic access. */
1329
1330    pid_t pid;  進程的pid
1331    pid_t tgid;
1332
1333#ifdef CONFIG_CC_STACKPROTECTOR
1334    /* Canary value for the -fstack-protector gcc feature */
1335    unsigned long stack_canary;
1336#endif


//下面一段為進程的父子關係
    struct task_struct __rcu *real_parent; /* real parent process */
1343    struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1344    /*
1345     * children/sibling forms the list of my natural children
1346     */
1347    struct list_head children;  /* list of my children */
1348    struct list_head sibling;   /* linkage in my parent's children list */
1349    struct task_struct *group_leader;   /* threadgroup leader */
1350
1351    /*
1352     * ptraced is the list of tasks this task is using ptrace on.
1353     * This includes both natural children and PTRACE_ATTACH targets.
1354     * p->ptrace_entry is p's link on the p->parent->ptraced list.
1355     */
1356    struct list_head ptraced;    調試用的
1357    struct list_head ptrace_entry;
1358
1359    /* PID/PID hash table linkage. */
1360    struct pid_link pids[PIDTYPE_MAX];  pid的哈希表   可以方便查找

1361    struct list_head thread_group;
1362    struct list_head thread_node;
1363


以下一段為時間相關的數據結構
    cputime_t utime, stime, utimescaled, stimescaled;
1369    cputime_t gtime;
1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1371    struct cputime prev_cputime;
1372#endif
1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1374    seqlock_t vtime_seqlock;
1375    unsigned long long vtime_snap;
1376    enum {
1377        VTIME_SLEEPING = 0,
1378        VTIME_USER,
1379        VTIME_SYS,
1380    } vtime_snap_whence;
1381#endif
1382    unsigned long nvcsw, nivcsw; /* context switch counts */
1383    u64 start_time;     /* monotonic time in nsec */
1384    u64 real_start_time;    /* boot based time in nsec */
1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1386    unsigned long min_flt, maj_flt;
1387
1388    struct task_cputime cputime_expires;
1389    struct list_head cpu_timers[3];
1390

/* process credentials */
1392    const struct cred __rcu *real_cred; /* objective and real subjective task
1393                     * credentials (COW) */
1394    const struct cred __rcu *cred;  /* effective (overridable) subjective task
1395                     * credentials (COW) */
1396    char comm[TASK_COMM_LEN]; /* executable name excluding path
1397                     - access with [gs]et_task_comm (which lock
1398                       it with task_lock())
1399                     - initialized normally by setup_new_exec */
1400/* file system info */
1401    int link_count, total_link_count;
1402#ifdef CONFIG_SYSVIPC
1403/* ipc stuff */
1404    struct sysv_sem sysvsem;
1405    struct sysv_shm sysvshm;
1406#endif
1407#ifdef CONFIG_DETECT_HUNG_TASK
1408/* hung task detection */
1409    unsigned long last_switch_count;
1410#endif
1411/* CPU-specific state of this task */
1412    struct thread_struct thread;    和當前任務CPU相關的一些狀態,與之前my_kernelvs中自己定義的PCB相似,在進程切換時起著關鍵作用

1413/* filesystem information */
1414    struct fs_struct *fs;   文件系統
1415/* open file information */
1416    struct files_struct *files;  打開的文件描述符列表
1417/* namespaces */
1418    struct nsproxy *nsproxy;
1419/* signal handlers */
1420    struct signal_struct *signal;  信號處理相關
1421    struct sighand_struct *sighand;
1422
1423    sigset_t blocked, real_blocked;
1424    sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
1425    struct sigpending pending;

fork一個子進程的代碼

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
/*
 * Minimal fork() demonstration.
 *
 * Forks once. The child prints a message and returns. The parent prints a
 * message, blocks in wait() until the child has terminated, and then
 * reports that the child completed.
 *
 * argc and argv are accepted but not used.
 *
 * Returns EXIT_SUCCESS in both processes on the normal path, and
 * EXIT_FAILURE if fork() or wait() fails.
 */
int main(int argc, char * argv[])
{
    /* fork() returns pid_t, which is not guaranteed to be the same as int */
    pid_t pid;

    (void)argc;
    (void)argv;

    /* fork another process */
    pid = fork();
    if (pid < 0)
    {
        /* error occurred: no child was created */
        fprintf(stderr,"Fork Failed!");
        exit(EXIT_FAILURE);
    }
    else if (pid == 0)
    {
        /* child process: fork() returned 0 */
        printf("This is Child Process!\n");
    }
    else
    {
        /* parent process: fork() returned the child's pid */
        printf("This is Parent Process!\n");
        /* parent will wait for the child to complete */
        if (wait(NULL) < 0)
        {
            perror("wait");
            return EXIT_FAILURE;
        }
        printf("Child Complete!\n");
    }

    return EXIT_SUCCESS;
}

創建一個新進程在內核中的執行過程
fork、vfork和clone三個系統調用都可以創建一個新進程,而且都是通過調用do_fork來實現進程的創建;
Linux通過複製父進程來創建一個新進程,那麼這就給我們理解這一個過程提供一個想象的框架:
複製一個PCB——task_struct
err = arch_dup_task_struct(tsk, orig);
要給新進程分配一個新的內核堆棧

tsk->stack = ti;
setup_thread_stack(tsk, orig); //這裡只是複製thread_info,而非複製內核堆棧

要修改複製過來的進程數據,比如pid、進程鏈表等等都要改改吧,見copy_process內部。
從用戶態的代碼看fork();函數返回了兩次,即在父子進程中各返回一次,父進程從系統調用中返回比較容易理解,子進程從系統調用中返回,那它在系統調用處理過程中的哪裡開始執行的呢?這就涉及子進程的內核堆棧數據狀態和task_struct中thread記錄的sp和ip的一致性問題,這是在哪裡設定的?在copy_process調用的copy_thread中:
*childregs = *current_pt_regs(); //複製內核堆棧
childregs->ax = 0; //為什麼子進程的fork返回0,這裡就是原因!
p->thread.sp = (unsigned long) childregs; //調度到子進程時的內核棧頂
p->thread.ip = (unsigned long) ret_from_fork; //調度到子進程時的第一條指令地址



![進程創建](http://upload-images.jianshu.io/upload_images/10820-43294157a9ce870c.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![一般系統調用圖解](http://upload-images.jianshu.io/upload_images/10820-d8ee9a6c3ebab641.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![fork系統調用圖解](http://upload-images.jianshu.io/upload_images/10820-5ca527cb133ea9d8.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![創建一個新進程在內核中的執行過程](http://upload-images.jianshu.io/upload_images/10820-0df97579f5c6dc72.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

創建進程的大致框架
複製父進程的PCB
修改複製的PCB
分配一個新的內核堆棧
copy原來的內核堆棧


創建進程調用do_fork

![Paste_Image.png](http://upload-images.jianshu.io/upload_images/10820-de373ccb53353ee4.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
do_fork中調用的copy_process包含創建一個進程的主要代碼

![copy_process](http://upload-images.jianshu.io/upload_images/10820-6ab5eb88b4fb8837.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![複製task_struct](http://upload-images.jianshu.io/upload_images/10820-d1416ef3511e3181.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![Paste_Image.png](http://upload-images.jianshu.io/upload_images/10820-fbfd2deb96628fba.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![修改PCB](http://upload-images.jianshu.io/upload_images/10820-d1715187ce3d737d.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![實驗截圖](http://upload-images.jianshu.io/upload_images/10820-fafd2132d9200058.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)


![設置斷點](http://upload-images.jianshu.io/upload_images/10820-38adcf0d7b3c6f4a.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
最後編輯于
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容