Update sched, irq, mm

freelancer-leon · freelancer-leon · commit f5c39dadadd5 · 2017-11-10T15:50:34.000+08:00
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 - [页缓存和页回写](https://github.com/freelancer-leon/notes/blob/master/kernel/page_cache.md)
 - [块I/O层（BIO）](https://github.com/freelancer-leon/notes/blob/master/kernel/bio.md)
 - [设备模型](https://github.com/freelancer-leon/notes/blob/master/kernel/dev_model.md)
-- [图形](https://github.com/freelancer-leon/notes/blob/master/kernel/graphic/Linux-Graphic.md)
+- [Linux 图形](https://github.com/freelancer-leon/notes/blob/master/kernel/graphic/Linux-Graphic.md)
 - [中断](https://github.com/freelancer-leon/notes/blob/master/kernel/irq.md)
 - [NMI](https://github.com/freelancer-leon/notes/blob/master/kernel/nmi)
 
@@ -38,6 +38,10 @@
 - [TCP](https://github.com/freelancer-leon/notes/blob/master/kernel/networking/TCP.md)
 - [macvlan](https://github.com/freelancer-leon/notes/blob/master/kernel/networking/macvlan.md)
 
+### Netfilter
+- [netfilter](https://github.com/freelancer-leon/notes/blob/master/kernel/networking/netfilter/netfilter.md)
+- [conntrack](https://github.com/freelancer-leon/notes/blob/master/kernel/networking/netfilter/conntrack.md)
+
 ## 初始化
 - [initcall](https://github.com/freelancer-leon/notes/blob/master/kernel/init/initcall.md)
 
diff --git a/kernel/Bottom_Half.md b/kernel/Bottom_Half.md
@@ -265,6 +265,13 @@ void irq_exit(void)
 ```
 
 ### `ksoftirqd`中断处理进程
+* 每个CPU一个的辅助处理softirq（和tasklet）的内核线程
+* 引入ksoftirqd的原因：
+  * 在中断处理函数返回时处理softirq是最常见的softirq处理时机
+  * softirq触发的频率有时很高，而有的softirq还会重新触发自己以便得到再次执行
+  * 为防止用户空间进程饥饿，作为折中的方案，内核不会立即处理重新触发的softirq
+  * 当大量softirq出现时，内核会唤醒一组内核线程来处理这些负载，即**ksoftirqd**
+* ksoftirqd每次迭代都会最终调用`schedule()`让其他进程有机会得到处理
 ```
 early_initcall(spawn_ksoftirqd)
 
@@ -598,57 +605,6 @@ EXPORT_SYMBOL(tasklet_init);
   * 注意，其他地方仍有可能将tasklet再度放回链表
   * 可能会引起休眠，**禁止在中断上下文使用**
 
-## ksoftirqd
-* 每个CPU一个的辅助处理softirq（和tasklet）的内核线程
-* 引入ksoftirqd的原因：
-  * 在中断处理函数返回时处理softirq是最常见的softirq处理时机
-  * softirq触发的频率有时很高，而有的softirq还会重新触发自己以便得到再次执行
-  * 为防止用户空间进程饥饿，作为折中的方案，内核不会立即处理重新触发的softirq
-  * 当大量softirq出现时，内核会唤醒一组内核线程来处理这些负载，即**ksoftirqd**
-* ksoftirqd每次迭代都会最终调用`schedule()`让其他进程有机会得到处理
-* kernel/softirq.c
-```c
-static int ksoftirqd_should_run(unsigned int cpu)
-{
-    return local_softirq_pending();
-}
-
-static void run_ksoftirqd(unsigned int cpu)
-{
-    local_irq_disable();
-    if (local_softirq_pending()) {
-        /*
-         * We can safely run softirq on inline stack, as we are not deep
-         * in the task stack here.
-         */
-        __do_softirq();
-        local_irq_enable();
-        cond_resched_rcu_qs();
-        return;
-    }
-    local_irq_enable();
-}
-...
-
-static struct smp_hotplug_thread softirq_threads = {
-    .store          = &ksoftirqd,
-    .thread_should_run  = ksoftirqd_should_run,
-    .thread_fn      = run_ksoftirqd,
-    .thread_comm        = "ksoftirqd/%u",
-};
-
-static __init int spawn_ksoftirqd(void)
-{
-    register_cpu_notifier(&cpu_nfb);
-
-    BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
-    return 0;
-}
-early_initcall(spawn_ksoftirqd);
-```
-
-
 # 工作队列（work queue)
 
 * 工作队列是将工作推后执行的一种方式，可以用来实现中断下半部
diff --git a/kernel/irq.md b/kernel/irq.md
@@ -74,3 +74,122 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
         return 1;
 }
 ```
+* arch/x86/entry/entry_64.S
+```c
+/*
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
+ */
+        .align 8
+ENTRY(irq_entries_start)
+    vector=FIRST_EXTERNAL_VECTOR
+    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+        pushq   $(~vector+0x80)                 /* Note: always in signed byte range */
+    vector=vector+1
+        jmp     common_interrupt  /*跳转至x86通用的汇编中断处理*/
+        .align  8
+    .endr
+END(irq_entries_start)
+
+/*
+ * Interrupt entry/exit.
+ *
+ * Interrupt entry points save only callee clobbered registers in fast path.
+ *
+ * Entry runs with interrupts off.
+ */
+
+/* 0(%rsp): ~(interrupt number) */
+        .macro interrupt func
+        cld
+        ALLOC_PT_GPREGS_ON_STACK
+        SAVE_C_REGS
+        SAVE_EXTRA_REGS
+        ENCODE_FRAME_POINTER
+
+        testb   $3, CS(%rsp)
+        jz      1f   
+
+        /*   
+         * IRQ from user mode.  Switch to kernel gsbase and inform context
+         * tracking that we're in kernel mode.
+         */
+        SWAPGS
+
+        /*
+         * We need to tell lockdep that IRQs are off.  We can't do this until
+         * we fix gsbase, and we should do it before enter_from_user_mode
+         * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
+         * the simplest way to handle it is to just call it twice if
+         * we enter from user mode.  There's no reason to optimize this since
+         * TRACE_IRQS_OFF is a no-op if lockdep is off.
+         */
+        TRACE_IRQS_OFF
+
+        CALL_enter_from_user_mode
+
+1:
+        /*
+         * Save previous stack pointer, optionally switch to interrupt stack.
+         * irq_count is used to check if a CPU is already on an interrupt stack
+         * or not. While this is essentially redundant with preempt_count it is
+         * a little cheaper to use a separate counter in the PDA (short of
+         * moving irq_enter into assembly, which would be too much work)
+         */
+        movq    %rsp, %rdi
+        incl    PER_CPU_VAR(irq_count)
+        cmovzq  PER_CPU_VAR(irq_stack_ptr), %rsp
+        pushq   %rdi
+        /* We entered an interrupt context - irqs are off: */
+        TRACE_IRQS_OFF
+
+        call    \func   /* rdi points to pt_regs */
+        .endm
+
+        /*
+         * The interrupt stubs push (~vector+0x80) onto the stack and
+         * then jump to common_interrupt.
+         */
+        .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+        ASM_CLAC
+        addq    $-0x80, (%rsp)                  /* Adjust vector to [-256, -1] range */
+        interrupt do_IRQ       /*跳转至x86通用的 C 中断处理，在上面列出了*/
+        /* 0(%rsp): old RSP */
+ret_from_intr:                 /*注意，这里是连着的，do_IRQ 返回后会接着执行后面的指令*/
+        DISABLE_INTERRUPTS(CLBR_ANY)
+        TRACE_IRQS_OFF
+        decl    PER_CPU_VAR(irq_count)
+
+        /* Restore saved previous stack */
+        popq    %rsp
+
+        testb   $3, CS(%rsp)  /*检查栈上保存的 CS 的低两位，判断中断是返回到 user space 还是 kernel space*/
+        jz      retint_kernel
+
+        /* Interrupt came from user space */
+GLOBAL(retint_user)
+        mov     %rsp,%rdi
+        call    prepare_exit_to_usermode
+        TRACE_IRQS_IRETQ
+        SWAPGS
+        jmp     restore_regs_and_iret
+
+/* Returning to kernel space */
+retint_kernel:
+#ifdef CONFIG_PREEMPT
+        /* Interrupts are off */
+        /* Check if we need preemption */
+        bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
+        jnc     1f
+0:      cmpl    $0, PER_CPU_VAR(__preempt_count) /*读取抢占计数，看能否进行内核抢占*/
+        jnz     1f                    /*如果抢占计数不为 0，通过跳转到 label 1 返回原执行点*/
+        call    preempt_schedule_irq  /*如果抢占计数为 0，触发内核抢占，这里是内核抢占的一个点*/
+        jmp     0b   /* preempt_schedule_irq 返回后再次跳回 label 0 检查抢占计数 */
+1:
+#endif
+        /*
+         * The iretq could re-enable interrupts:
+         */
+        TRACE_IRQS_IRETQ
+```
diff --git a/kernel/mm/mm-1-process_addr_spc.md b/kernel/mm/mm-1-process_addr_spc.md
@@ -702,10 +702,26 @@ static inline pteval_t pte_flags(pte_t pte)
   * 对于链接到同一共享库的 **不同程序** 来说，则可以从共享库的映射中获得好处，因为同一共享库的`.text`只会映射到同一物理页。如，bash 和 vi 都链接到的 libc.so 共享同一物理页。
   * 用静态链接显然就没有这一好处了，静态链接实际上把内容合并到程序的二进制文件中，在运行不同程序时无法区分这些相同的内容，因此必然会消耗更多的内存。
 
+## 页表为什么要分级？
+* [Linux内核4级页表的演进](http://larmbr.com/2014/01/19/the-evolution-of-4-level-page-talbe-in-linux/)讲的蛮好
+* 简单地说，分级的目的是为了 **节省内存**
+  * 如果采用一个简单的大页表一对一映射虚拟地址到物理地址，需要较多的 **连续的物理页面(physical page)**，因为是 **一对一** 映射，此时又未分级，所以必须是 **连续的**。
+  * 如果进程比较多，则需要大量的连续物理页，这不现实。
+  * 然而，在现实中，程序存在局部化特征，这意味着在特定的时间内只有部分内存会被频繁访问。具体点，进程空间中的`.text`段（即程序代码）、堆、共享库、栈都是固定在进程空间的某个特定部分，这样导致进程空间其实是非常稀疏的。
+  * 所谓 **分级** 简单说就是，把整个进程空间分成区块（block），区块下面可以再细分，这样在内存中只要常驻某个区块的页表即可，这样可以大量节省内存。
+  * 这里的套路是：
+    * 区块中的条目存的是下一级区块的 **基址** （物理地址）
+    * 将虚拟地址的不同位分别作为不同级别区块的 **索引**（也就是 **偏移**）
+    * 只要提供 *最上级页目录的基址* 和 *虚拟地址*，即可采用 **基址 + 偏移** 的方式逐级索引至物理地址。
+  * 每个区块所需的空间并不大，而进程空间中未使用的地址无需建立对应的区块，因此每个进程的页表用的空间并不多，这样目的就达到了。
+
 # 参考资料
 
 * [How the Kernel Manages Your Memory](http://duartes.org/gustavo/blog/post/how-the-kernel-manages-your-memory/)
 * [How Linux Kernel Manages Application Memory](http://techblog.cloudperf.net/2016/07/how-linux-kernel-manages-application_18.html)
 * [The Performance Impact of Kernel Prefetching on Buffer Cache Replacement Algorithms](http://www.cs.arizona.edu/projects/dream/papers/sigm05_prefetch.pdf)
 * [Linux Cache 机制探究](http://www.penglixun.com/tech/system/linux_cache_discovery.html)
 * [Using the Microprocessor MMU for Software Protection in Real-Time Systems](http://www.lynx.com/using-the-microprocessor-mmu-for-software-protection-in-real-time-systems/)
+* [Linux内核4级页表的演进](http://larmbr.com/2014/01/19/the-evolution-of-4-level-page-talbe-in-linux/)
+* [地址空间的归纳总结](http://alanwu.blog.51cto.com/3652632/1082195)
+* [PCI设备的地址空间](http://www.cnblogs.com/zszmhd/archive/2012/05/08/2490105.html)
diff --git a/kernel/networking/route.md b/kernel/networking/route.md
@@ -0,0 +1,9 @@
+# References
+- [Guide to IP Layer Network Administration with Linux - 4.8. Routing Tables](http://linux-ip.net/html/routing-tables.html)
+- [ip-route (8) - Linux Man Pages](https://www.systutorials.com/docs/linux/man/8-ip-route/)
+- [浅析Linux Kernel 哈希路由表实现(一)](http://basiccoder.com/intro-linux-kernel-hash-rt-1.html)
+- [浅析Linux Kernel 哈希路由表实现(二)](http://basiccoder.com/intro-linux-kernel-hash-rt-2.html)
+- [Linux kernel路由机制分析](http://lib.csdn.net/article/linux/37220)
+- [路由表 rtable](http://abcdxyzk.github.io/blog/2015/08/25/kernel-net-rtable/)
+- [路由的基本概念介绍](http://www.pagefault.info/?p=240)
+- [The Network Administrators' Guide - Displaying the Routing Table](http://www.tldp.org/LDP/nag/node75.html#SECTION007910000)
diff --git a/kernel/sched/sched_kernel_preempt.md b/kernel/sched/sched_kernel_preempt.md
@@ -12,6 +12,10 @@
 	- [PowerPC-32的实现](#PowerPC-32的实现)
 	- [preempt_schedule_irq()函数](#preempt_schedule_irq()函数)
 
+## 用户抢占发生的时机
+* 从系统调用返回用户空间时（在执行系统调用期间被抢占属于内核抢占，这里说的是执行用户态的代码时发生的抢占，比如说用户态的程序 A 切换到了用户态程序 B）
+* 从中断处理程序返回用户空间时
+
 ## 内核抢占发生的时机
 * 中断处理程序正在执行，且返回内核空间之前。
 * 内核代码再一次具有可抢占性的时候（例如，调用`preempt_enable()`的时候）。
diff --git a/kernel/sched/sched_linux.md b/kernel/sched/sched_linux.md
@@ -607,7 +607,7 @@ Per-CPU相关代码见：
 ### 计算优先级
 * `static_prio` 通常是优先级计算的起点
 * `prio` 是调度器关心的优先级，通常由`effective_prio()`计算，计算时考虑当前的优先级的值
-* `prio` 有可能会因为*非实时进程*要使用实时互斥量(RT-Mutex)而临时提高优先级至实时优先级
+* `prio` 有可能会因为 *非实时进程* 要使用实时互斥量(RT-Mutex)而临时提高优先级至实时优先级
 * `normal_prio` 通常由`normal_prio()`计算，计算时考虑调度策略的因素
 
 ```c
@@ -629,7 +629,7 @@ static inline int __normal_prio(struct task_struct *p)
 static inline int normal_prio(struct task_struct *p)
 {
     int prio;
-
+    /*基于进程静态优先级和调度策略计算优先级，不考虑优先级继承*/
     if (task_has_dl_policy(p))
         prio = MAX_DL_PRIO-1;
     else if (task_has_rt_policy(p))
@@ -656,12 +656,12 @@ static int effective_prio(struct task_struct *p)
      */
     if (!rt_prio(p->prio))
         return p->normal_prio;
-    return p->prio;
+    return p->prio; /*返回继承的 RT boosted 优先级*/
 }
 ```
 * `fork`子进程时
   * 子进程的静态优先级`static_prio`继承自父进程
-  * 动态优先级`prio`设置为父进程的普通优先级`normal_prio`。这是为了确保实时互斥量引起的优先级提高**不会**传递到子进程
+  * 动态优先级`prio`设置为父进程的普通优先级`normal_prio`。这是为了确保实时互斥量引起的优先级提高 **不会** 传递到子进程
 
 
 ## 创建进程
diff --git a/kernel/sched/sched_rt.md b/kernel/sched/sched_rt.md
@@ -841,7 +841,7 @@ void sched_avg_update(struct rq *rq)
 * kernel/sched/sched.h
 ```c
 static inline u64 sched_avg_period(void)
-{       /*周期的值是设定的 sysctl_sched_time_avg 的1/2*/
+{       /* 周期的值是设定的 sysctl_sched_time_avg 的1/2 */
         return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
 }
 ```
@@ -869,7 +869,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
         struct sched_rt_entity *rt_se;
         int cpu = cpu_of(rq_of_rt_rq(rt_rq)); /*实时调度队列所在的 CPU*/
-
+        /*rt_se 为该实时调度队列所属的实时调度实体*/
         rt_se = rt_rq->tg->rt_se[cpu];
         /*如果该调度实体存在则从队列上移除它，否则只对队列的顶层的一些数据进行变更*/
         if (!rt_se)
@@ -1118,6 +1118,7 @@ short int | bit code | unsigned short int | 5 bit maximum = 31
 * RT throttling 一旦出现什么时候会结束呢？换句话来说，实时调度队列积累的实时任务运行时间`rt_rq->rt_time`什么时候会减小？这是我们接下来要观察的问题。
 * 这里主要依赖的机制是高精度定时器，基本过程如下：
   * 该定时器在内核初始化时，由`sched_init()`根据不同的条件调用`init_rt_bandwidth()`初始化定时器，回调函数为`sched_rt_period_timer()`。
+  * 该定时器在该任务组没有进程时并不会工作，通过`/proc/timer_list`无法看到该定时器在排队。
   * 在有进程进入队列时，如果该队列的任务组的周期性定时器尚未启动，则会在此时启动。
   * 当定时器到期时调用注册的回调函数`sched_rt_period_timer()`进行检查。
 
@@ -1219,7 +1220,7 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
            hrtimer_forward_now() 返回 overrun = 0，循环结束。
         b) 某时刻 T1 为 12，此时旧定时器时刻 timer1 为 6，那么调用 hrtimer_forward_now()，
            新定时器时刻 timer2 为 6 + 8 = 14，overrun = 1。
-           由于某种原因 do_sched_rt_period_timer() 得到运行的时间过长（注意，它处于
+           由于某种原因 do_sched_rt_period_timer() 得到运行的时间过长（注意，它不在
            rt_b->rt_runtime_lock 保护的临界区），返回时时刻 T2 为 16，
            T2 在 timer2 之后，新定时器时间已经错过了，这种情况也需要处理。再次调用
            hrtimer_forward_now()，将更新定时器 14 + 8 = 22，返回 overrun = 1，需要下
@@ -1668,6 +1669,17 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 ```
 
+### 问题：如果一个`SCHED_FIFO`进程被中断打断，抢占是开启的，不考虑进程迁移情况，也没有新的高优先级实时进程插进运行队列，中断退出时会否因为内核抢占而被换走？
+* 之所以会有这样一个疑问是因为在普通任务调度上会存在这样的情况：
+  * 对于普通任务调度来说，正在执行的任务是不在运行队列（红黑树）上的（尽管它的`sched_entity->on_rq`域的值 > 0）。
+  * 在`__schedule()`时，普通任务调度的`put_prev_task_fair()`会有一个入列后再遴选的动作，由于`vruntime`的增加，原进程很可能不再是最合适的任务了。
+* 对于实时调度，这个问题涉及到的一个关键细节是，正在执行的实时进程在不在实时运行队列上？
+  * 对于实时任务调度来说，正在执行的任务还在实时运行队列上，并没被移出过队列。
+  * 在`__schedule()`时，实时调度的`put_prev_task_rt()`并没有操作实时运行队列。
+  * 所以如果不考虑进程迁移和新的高优先级实时进程插进运行队列的情况，`pick_next_rt_entity()`选中的仍然是原来位置上的任务，也就是原进程。
+* 如果抢占是关闭的，无论是普通任务还是实时任务，即使有更合适的任务（对于普通任务调度，有`vruntime`更小的任务；对于实时调度，有更高优先级的任务进入运行队列），也不会发生内核抢占。
+  * 记得，`preempt_enable()`是内核抢占的一个点，所以高优先级任务会在低优先级任务开启抢占的时候被调度。
+
 # 进程退出
 * 实时任务的退出队列时从`dequeue_task_rt()`开始看起。
 ```c
diff --git a/kernel/sched/sched_unix.md b/kernel/sched/sched_unix.md
@@ -20,7 +20,7 @@
 
 
 # 公平共享调度（Fair Share Scheduling）
-###公平共享调度的原则
+### 公平共享调度的原则
 * 将用户团体分为一些公平共享组
 * 每组成员受到常规的进程调度的限制，即组内成员公平共享
 * CPU时间按比例分配给每个组，不考虑组中成员有多少
diff --git a/kernel/time.md b/kernel/time.md