|
| 1 | +# Kexec |
| 2 | + |
| 3 | +## 几种 Linux 内核镜像文件的区别: |
| 4 | +1. `vmlinux`:编译出来的最原始的内核文件,未压缩。 |
| 5 | +2. `zImage` :是`vmlinux`经过一系列处理(包括`gzip`压缩)后的文件。 |
| 6 | +3. `bzImage`:`bz`表示“big zImage”,不是用`bzip2`压缩的。两者的不同之处在于: |
| 7 | + * `zImage`解压缩内核到低端内存(第一个`640K`), |
| 8 | + * `bzImage`解压缩内核到高端内存(`1M`以上)。如果内核比较小,那么采用`zImage`或`bzImage`都行,如果比较大应该用`bzImage`。 |
| 9 | +4. `uImage` :U-boot 专用的映像文件,它是在`zImage`之前加上一个长度为`0x40`的 tag(`64`个字节,说明这个映像文件的类型、加载位置、生成时间、大小等信息)。其实就是一个自动跟手动的区别, |
| 10 | + * 有了`uImage`头部的描述,U-boot 就知道对应 Image 的信息,如果没有头部则需要自己手动去搞那些参数。 |
| 11 | + * 换句话说,如果直接从`uImage`的`0x40`位置开始执行,`zImage`和`uImage`没有任何区别。 |
| 12 | +5. `vmlinuz`:是`bzImage`/`zImage`文件的拷贝或指向`bzImage`/`zImage`的链接。 |
| 13 | +6. `initrd` :是“initial ramdisk”的简写。一般被用来临时地引导硬件到实际内核`vmlinuz`能够接管并继续引导的状态。 |
| 14 | +7. `fitImage`:Flattened uImage Tree,根据 image source file(.its) 将 dtb、kernel image、initrd 打包而成的`.itb`镜像文件。 |
| 15 | +8. `Image|vmlinux.bin` |
| 16 | + * 未配置压缩内核镜像时用类似`objcopy -O binary -R .note -R .comment -R .note.gnu.build-id -S vmlinux`命令生成的镜像文件。 |
| 17 | + * 配置压缩内核镜像时,先在`arch/<arch>/boot/compressed`目录下生成`Image|vmlinux.bin`,再压缩成`.gz|.bz2`文件,再加上一些解压信息,比如`piggy.S`,再链接,最终生成`arch/<arch>/boot/`下的`Image|vmlinux.bin` |
| 18 | +9. `Image.[gz|bz2]/vmlinux.bin.[gz|bz2]`:可以理解为,仅是对瘦身后的内核镜像再压缩后得到的中间产物,通常还有随后的处理。 |
| 19 | + * 配置了`CONFIG_KERNEL_GZIP`或`CONFIG_KERNEL_BZIP2`,会对`arch/<arch>/boot/compressed`目录的`Image/vmlinux.bin`文件进行压缩。 |
| 20 | + |
| 21 | +## 相关配置 |
| 22 | +* Kernel Configurations |
| 23 | + ```sh |
| 24 | + # CONFIG_KEXEC depends on: PM_SLEEP_SMP [=y] |
| 25 | + CONFIG_SUSPEND=y |
| 26 | + CONFIG_PM_SLEEP=y |
| 27 | + CONFIG_PM_SLEEP_SMP=y |
| 28 | + # Enable the kexec syscall |
| 29 | + CONFIG_KEXEC=y |
| 30 | + # Generate crash dump after being started by kexec. |
| 31 | + CONFIG_CRASH_DUMP=y |
| 32 | + # Exports the dump image of crashed kernel in ELF format. |
| 33 | + CONFIG_PROC_VMCORE=y |
| 34 | + CONFIG_DEBUG_INFO=y |
| 35 | + ``` |
| 36 | +* /etc/sysconfig/kdump |
| 37 | + ``` |
| 38 | + KDUMP_COMMANDLINE_APPEND="maxcpus=1 " |
| 39 | + ``` |
| 40 | +## kexec |
| 41 | +* kexec 相关的三个系统调用 |
| 42 | + * `kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment *segments, unsigned long flags)` |
| 43 | + * `kexec_file_load()` |
| 44 | + * `reboot(LINUX_REBOOT_CMD_KEXEC)` |
| 45 | +* 可以从`/proc/iomem`读到主内核启动时配置`crashkernel`而保留的内存地址范围 |
| 46 | + |
| 47 | +### kexec 加载内核 |
| 48 | +#### 用户态 kexec-tools |
| 49 | +```c |
| 50 | +main() |
| 51 | +-> arch_process_options() |
| 52 | +if (do_kexec_file_syscall) |
| 53 | +-> do_kexec_file_load() |
| 54 | +if (!do_kexec_file_syscall) |
| 55 | +-> my_load() |
| 56 | + +-> kernel_buf = slurp_decompress_file() |
| 57 | + | -> slurp_file() |
| 58 | + | -> slurp_file_generic() |
| 59 | + | -> slurp_fd() |
| 60 | + | -> buf = xmalloc(size) // kernel_buf 会指向这 |
| 61 | + +-> get_memory_ranges() |
| 62 | + | -> get_memory_ranges_iomem() |
| 63 | + | -> kexec_iomem_for_each_line(NULL, get_memory_ranges_iomem_cb, array) |
| 64 | + | -> proc_iomem() |
| 65 | + | return proc_iomem_str; //"/proc/iomem" |
| 66 | + | -> callback(data, nr, str, start, size) |
| 67 | + | => get_memory_ranges_iomem_cb() //00000000-01ffffff : System RAM 等多条数据 |
| 68 | + | -> set_phys_offset(r->start) |
| 69 | + +-> file_type[i].probe(kernel_buf, kernel_size) |
| 70 | + | => image_arm64_probe() // -t Image |
| 71 | + +-> physical_arch() //根据`uname()`的返回结果确定当前系统的 ARCH 的类型 |
| 72 | + +-> file_type[i].load(argc, argv, kernel_buf, kernel_size, &info) |
| 73 | + => image_arm64_load() // -t Image |
| 74 | + -> arm64_process_image_header() |
| 75 | + -> arm64_header_check_magic() //检查`kernel_buf`的`magic[4]`是不是`"ARM\x64"` |
| 76 | + -> arm64_mem.text_offset = arm64_header_text_offset(h) |
| 77 | + -> arm64_mem.image_size = arm64_header_image_size(h) |
| 78 | + -> arm64_locate_kernel_segment() |
| 79 | +-> my_exec() |
| 80 | + -> reboot(LINUX_REBOOT_CMD_KEXEC) |
| 81 | +``` |
| 82 | +* `slurp_fd()`将指定的文件(也可以是字符设备或块设备)读入到新分配的内存里 |
| 83 | +* 这个路径上的`get_memory_ranges()`从`/proc/iomem`读取`System RAM`的范围 |
| 84 | +
|
| 85 | +#### 内核态 kexec |
| 86 | +
|
| 87 | +### kexec 切换内核 |
| 88 | +```c |
| 89 | +kernel/reboot.c |
| 90 | +SYSCALL_DEFINE4(reboot,...) |
| 91 | + case LINUX_REBOOT_CMD_KEXEC: |
| 92 | + -> kernel_kexec() |
| 93 | + -> machine_kexec(kexec_image) |
| 94 | +``` |
| 95 | +* arch/arm64/include/asm/sysreg.h |
| 96 | + ```c |
| 97 | + /* |
| 98 | + * The "Z" constraint normally means a zero immediate, but when combined with |
| 99 | + * the "%x0" template means XZR. |
| 100 | + */ |
| 101 | + #define write_sysreg(v, r) do { \ |
| 102 | + u64 __val = (u64)(v); \ |
| 103 | + asm volatile("msr " __stringify(r) ", %x0" \ |
| 104 | + : : "rZ" (__val)); \ |
| 105 | + } while (0) |
| 106 | + ... |
| 107 | + ``` |
| 108 | +
|
| 109 | +* arch/arm64/include/asm/mmu_context.h |
| 110 | + ```c |
| 111 | + /* |
| 112 | + * Set TTBR0 to empty_zero_page. No translations will be possible via TTBR0. |
| 113 | + */ |
| 114 | + static inline void cpu_set_reserved_ttbr0(void) |
| 115 | + { |
| 116 | + unsigned long ttbr = phys_to_ttbr(__pa_symbol(empty_zero_page)); |
| 117 | +
|
| 118 | + write_sysreg(ttbr, ttbr0_el1); |
| 119 | + isb(); /*asm volatile("isb" : : : "memory")*/ |
| 120 | + } |
| 121 | + ``` |
| 122 | + * `isb` 是指令同步屏障(Instruction Synchronization Barrier),该指令冲刷处理器的流水线,重新读取屏障指令后面的所有指令。 |
| 123 | + * **TTBR** 是 “Translation Table Base Register” 的缩写,表示转换表基准寄存器。 |
| 124 | + * **EL1** 是 “Exception Level 1” 的缩写,表示异常级别 1。 |
| 125 | + * 寄存器`TTBR1_EL1`存放 *内核的页全局目录* 的物理地址。 |
| 126 | + * 寄存器`TTBR0_EL1`存放 *进程的页全局目录* 的物理地址。 |
| 127 | + |
| 128 | +* arch/arm64/include/asm/mmu_context.h |
| 129 | + ```c |
| 130 | + static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) |
| 131 | + { |
| 132 | + BUG_ON(pgd == swapper_pg_dir); |
| 133 | + cpu_set_reserved_ttbr0(); |
| 134 | + cpu_do_switch_mm(virt_to_phys(pgd),mm); |
| 135 | + } |
| 136 | + |
| 137 | + static inline void cpu_install_idmap(void) |
| 138 | + { |
| 139 | + cpu_set_reserved_ttbr0(); |
| 140 | + local_flush_tlb_all(); |
| 141 | + cpu_set_idmap_tcr_t0sz(); |
| 142 | + |
| 143 | + cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm); |
| 144 | + } |
| 145 | + ``` |
| 146 | +
|
| 147 | +* arch/arm64/include/asm/sysreg.h |
| 148 | + ```c |
| 149 | + /* Common SCTLR_ELx flags. */ |
| 150 | + #define SCTLR_ELx_EE (1 << 25) |
| 151 | + #define SCTLR_ELx_IESB (1 << 21) |
| 152 | + #define SCTLR_ELx_WXN (1 << 19) |
| 153 | + #define SCTLR_ELx_I (1 << 12) |
| 154 | + #define SCTLR_ELx_SA (1 << 3) |
| 155 | + #define SCTLR_ELx_C (1 << 2) |
| 156 | + #define SCTLR_ELx_A (1 << 1) |
| 157 | + #define SCTLR_ELx_M 1 |
| 158 | +
|
| 159 | + #define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ |
| 160 | + SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB) |
| 161 | + ``` |
| 162 | +* arch/arm64/kernel/cpu-reset.S |
| 163 | + ```nasm |
| 164 | + .text |
| 165 | + .pushsection .idmap.text, "awx" |
| 166 | +
|
| 167 | + /* |
| 168 | + * __cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) - Helper for |
| 169 | + * cpu_soft_restart. |
| 170 | + * |
| 171 | + * @el2_switch: Flag to indicate a switch to EL2 is needed. |
| 172 | + * @entry: Location to jump to for soft reset. |
| 173 | + * arg0: First argument passed to @entry. |
| 174 | + * arg1: Second argument passed to @entry. |
| 175 | + * arg2: Third argument passed to @entry. |
| 176 | + * |
| 177 | + * Put the CPU into the same state as it would be if it had been reset, and |
| 178 | + * branch to what would be the reset vector. It must be executed with the |
| 179 | + * flat identity mapping. |
| 180 | + */ |
| 181 | + ENTRY(__cpu_soft_restart) |
| 182 | + /* Clear sctlr_el1 flags. */ |
| 183 | + mrs x12, sctlr_el1 ;x12 = sctlr_el1 |
| 184 | + ldr x13, =SCTLR_ELx_FLAGS ;x13 设为系统控制寄存器标志位 |
| 185 | + bic x12, x12, x13 ;x12 = x12 & ~x13,清空标志位 |
| 186 | + pre_disable_mmu_workaround |
| 187 | + msr sctlr_el1, x12 ;关闭 MMU |
| 188 | + isb |
| 189 | +
|
| 190 | + cbz x0, 1f // el2_switch? |
| 191 | + mov x0, #HVC_SOFT_RESTART |
| 192 | + hvc #0 // no return |
| 193 | +
|
| 194 | + 1: mov x18, x1 // entry ;reboot_code_buffer_phys 即 arm64_relocate_new_kernel |
| 195 | + mov x0, x2 // arg0 ;kimage->head |
| 196 | + mov x1, x3 // arg1 ;kimage->start |
| 197 | + mov x2, x4 // arg2 ;kimage->arch.dtb_mem |
| 198 | + br x18 |
| 199 | + ENDPROC(__cpu_soft_restart) |
| 200 | +
|
| 201 | + .popsection |
| 202 | + ``` |
| 203 | + * 该 routine 放入`.idmap.text`节,因为包含关闭 MMU 的操作 |
| 204 | + * **SCTLR_EL1** 为系统控制寄存器 |
| 205 | + * 所以关闭 MMU 后,第一段要执行的 routine 是`arm64_relocate_new_kernel` |
| 206 | + * `arm64_relocate_new_kernel`的主要工作是 |
| 207 | + * 将 dtb 的地址放入`x0`,作为第一个参数 |
| 208 | + * 跳转到`kimage->start`,记得在`kimage_alloc_init()`时`kexec_image->start`设为`entry` |
| 209 | + * 最后跳转到`entry`,一直往前追溯,`entry`为`kexec_load()`用户态传入的地址,在 kexec-tools 中被设置为 `purgatory_start`的地址 |
| 210 | + |
| 211 | +* arch/arm64/kernel/cpu-reset.h |
| 212 | + ```c |
| 213 | + static inline void __noreturn cpu_soft_restart(unsigned long el2_switch, |
| 214 | + unsigned long entry, unsigned long arg0, unsigned long arg1, |
| 215 | + unsigned long arg2) |
| 216 | + { |
| 217 | + typeof(__cpu_soft_restart) *restart; |
| 218 | + |
| 219 | + el2_switch = el2_switch && !is_kernel_in_hyp_mode() && |
| 220 | + is_hyp_mode_available(); |
| 221 | + restart = (void *)__pa_symbol(__cpu_soft_restart); |
| 222 | + |
| 223 | + cpu_install_idmap(); |
| 224 | + restart(el2_switch, entry, arg0, arg1, arg2); |
| 225 | + unreachable(); |
| 226 | + } |
| 227 | + ... |
| 228 | + ``` |
| 229 | +
|
| 230 | +#### 恒等映射(flat identity mapping) |
| 231 | +* **恒等映射** 这里与常说的 *一致映射、线性映射、直接映射* 不是一个概念。 |
| 232 | +* 恒等映射的特点是虚拟地址和物理地址相同,是为了在开启处理器的 MMU 的一瞬间能够平滑过渡。 |
| 233 | +* 恒等映射是为恒等映射代码节(`.idmap.text`)创建的映射,`idmap_pg_dir`是恒等映射的页全局目录(即第一级页表,pgd)的起始地址(当然是物理地址)。 |
| 234 | +
|
| 235 | +### Purgatory |
| 236 | +* purgatory/arch/arm64/entry.S |
| 237 | + ```nasm |
| 238 | + /* |
| 239 | + * ARM64 purgatory. |
| 240 | + */ |
| 241 | +
|
| 242 | + .macro size, sym:req ;计算符号大小的汇编宏 |
| 243 | + .size \sym, . - \sym ;“.”为当前位置,“\sym”替换为符号,即符号的起始位置,它们的差值即为符号大小 |
| 244 | + .endm |
| 245 | +
|
| 246 | + .text |
| 247 | +
|
| 248 | + .globl purgatory_start |
| 249 | + purgatory_start: |
| 250 | +
|
| 251 | + adr x19, .Lstack |
| 252 | + mov sp, x19 |
| 253 | +
|
| 254 | + bl purgatory |
| 255 | +
|
| 256 | + /* Start new image. */ |
| 257 | + ldr x17, arm64_kernel_entry |
| 258 | + ldr x0, arm64_dtb_addr |
| 259 | + mov x1, xzr |
| 260 | + mov x2, xzr |
| 261 | + mov x3, xzr |
| 262 | + br x17 ;跳转到 arm64_kernel_entry,第一个参数为 arm64_dtb_addr |
| 263 | +
|
| 264 | + size purgatory_start |
| 265 | + ``` |
| 266 | + * `purgatory_start`先调用`purgatory` |
| 267 | + * 再跳转到真正的内核入口地址`arm64_kernel_entry`,第一个参数为设备树的地址`arm64_dtb_addr` |
| 268 | + * 剩下三个参数寄存器被填充为 0 |
| 269 | + |
| 270 | +# References |
| 271 | +- [深入探索 Kdump,第 1 部分:带你走进 Kdump 的世界](https://www.ibm.com/developerworks/cn/linux/l-cn-kdump1/index.html) |
| 272 | +- [深入探索 Kdump,第 2 部分:Kdump 图形化配置工具](https://www.ibm.com/developerworks/cn/linux/l-cn-kdump2/index.html) |
| 273 | +- [深入探索 Kdump,第 3 部分:Kdump 原理探秘](https://www.ibm.com/developerworks/cn/linux/l-cn-kdump3/index.html) |
| 274 | +- [深入探索 Kdump,第 4 部分:kdump 的亲密战友 crash](https://www.ibm.com/developerworks/cn/linux/l-cn-kdump4/index.html) |
| 275 | +- [kexec - A travel to the purgatory](https://eastrivervillage.com/kexec-tools-with-the-hidden-purgatory/) |
| 276 | +- [u-boot FIT image介绍](http://www.wowotech.net/u-boot/fit_image_overview.html) |
0 commit comments