copyright_author: 科学边界
copyright_author_href: https://www.daodaodao123.com/
copyright_info: 此文章版权归科学边界所有,如有转载,请注明来自原作者
copyright_url: https://www.daodaodao123.com/?p=715


首先是引导程序,即 bootloader,简单说即 bootloader 会做如下事情:
(1)初始化物理内存;
(2)设置设备树;
(3)解压缩内核映像,将其加载到内核运行地址(可选);
(4)跳转到内核入口地址;

下面进入 Linux 范畴:

一、内核的起始地址

第一个要看的文件,“arch/arm64/kernel/vmlinux.lds.S”,Linux内核的链接脚本。

OUTPUT_ARCH(aarch64)  /* target architecture of the linked output: aarch64 */
ENTRY(_text)          /* kernel entry point is the _text symbol */

Linux 内核的内存布局定义

 /***************************************************************************
 * 内核的内存布局:*
 * 包括代码段(.text),只读数据段(.rodata),初始化数据段(.init.), .bss 段等
 * 几个常见的地址在 arch/arm64/mm/init.c 加了打印
 *
 **************************************************************************/
SECTIONS
{
    . = KIMAGE_VADDR;  /* start the location counter at the kernel image's link-time (virtual) base address */

	.head.text : {
		_text = .;     /* _text marks the very first byte of the image; it is the symbol named by ENTRY() */
		HEAD_TEXT
	}
	.text : ALIGN(SEGMENT_ALIGN) {	/* Real text segment		*/
		_stext = .;		/* Text and read-only data	*/
			IRQENTRY_TEXT
			SOFTIRQENTRY_TEXT
			ENTRY_TEXT
			TEXT_TEXT
			SCHED_TEXT
			CPUIDLE_TEXT
			LOCK_TEXT
			KPROBES_TEXT
			HYPERVISOR_TEXT
			IDMAP_TEXT
			HIBERNATE_TEXT
			TRAMP_TEXT
			*(.fixup)
			*(.gnu.warning)
		. = ALIGN(16);
		*(.got)			/* Global offset table		*/
	}
}

这里以我司一个高通 sm4450 平台的项目为例,使用 readelf 读取 vmlinux 的段表。
由 vmlinux.lds.S 中的 SECTIONS 定义可知,KIMAGE_VADDR 对应的就是 readelf 输出结果中 [ 1] .head.text 的起始地址,所以 KIMAGE_VADDR 为 0xffffffc008000000

llvm-readelf.exe -S D:\dump\vmlinux\vmlinux
There are 53 section headers, starting at offset 0x1c8612f0:

Section Headers:
  [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL            0000000000000000 000000 000000 00      0   0  0
  [ 1] .head.text        PROGBITS        ffffffc008000000 010000 010000 00  AX  0   0 65536
  [ 2] .text             PROGBITS        ffffffc008010000 020000 14131d8 00 WAX  0   0 4096
  [ 3] .rodata           PROGBITS        ffffffc009430000 1440000 aa45b3 00 WAMS  0   0 4096
  [ 4] .rodata1          PROGBITS        ffffffc009ed45b3 1ee45b3 000000 00  WA  0   0  1
  [ 5] .pci_fixup        PROGBITS        ffffffc009ed45c0 1ee45c0 0024c0 00   A  0   0 16
  [ 6] .builtin_fw       PROGBITS        ffffffc009ed6a80 1ee6a80 000000 00   A  0   0  8
  [ 7] __ksymtab         PROGBITS        ffffffc009ed6a80 1ee6a80 0084fc 00   A  0   0  4
  [ 8] __ksymtab_gpl     PROGBITS        ffffffc009edef7c 1eeef7c 00c660 00   A  0   0  4
  [ 9] __ksymtab_unused  PROGBITS        ffffffc009eeb5dc 1efb5dc 000000 00   A  0   0  1
  [10] __ksymtab_unused_gpl PROGBITS     ffffffc009eeb5dc 1efb5dc 000000 00   A  0   0  1
  [11] __ksymtab_gpl_future PROGBITS     ffffffc009eeb5dc 1efb5dc 000000 00   A  0   0  1
  [12] __kcrctab         PROGBITS        ffffffc009eeb5dc 1efb5dc 002c54 00   A  0   0  4
  [13] __kcrctab_gpl     PROGBITS        ffffffc009eee230 1efe230 004220 00   A  0   0  1
  [14] __kcrctab_unused  PROGBITS        ffffffc009ef2450 1f02450 000000 00   A  0   0  1
  [15] __kcrctab_unused_gpl PROGBITS     ffffffc009ef2450 1f02450 000000 00   A  0   0  1
  [16] __kcrctab_gpl_future PROGBITS     ffffffc009ef2450 1f02450 000000 00   A  0   0  1
  [17] __ksymtab_strings PROGBITS        ffffffc009ef2450 1f02450 02816c 01 AMS  0   0  1
  [18] __init_rodata     PROGBITS        ffffffc009f1a5bc 1f2a5bc 000000 00   A  0   0  1
  [19] __param           PROGBITS        ffffffc009f1a5c0 1f2a5c0 003778 00   A  0   0  8
  [20] __modver          PROGBITS        ffffffc009f1dd38 1f2dd38 0000d8 00   A  0   0  8
  [21] __ex_table        PROGBITS        ffffffc009f1de10 1f2de10 002d68 00   A  0   0  8
  [22] .notes            NOTE            ffffffc009f20b78 1f30b78 00003c 00   A  0   0  4
  [23] .hyp.rodata       PROGBITS        ffffffc009f21000 1f31000 001000 00 WAMS  0   0  8
  [24] .init.text        PROGBITS        ffffffc009f30000 1f40000 05ec8c 00  AX  0   0  4
  [25] .exit.text        PROGBITS        ffffffc009f8ec8c 1f9ec8c 0082dc 00  AX  0   0  4
  [26] .altinstructions  PROGBITS        ffffffc009f96f68 1fa6f68 0cb5bc 00   A  0   0  1
  [27] .init.data        PROGBITS        ffffffc00a070000 2080000 015f35 00 WAMS  0   0 256
  [28] .data..percpu     PROGBITS        ffffffc00a086000 2096000 017898 00  WA  0   0 64
  [29] .hyp.data..percpu PROGBITS        ffffffc00a09e000 20ae000 000e88 00  WA  0   0 16
  [30] .hyp.reloc        PROGBITS        ffffffc00a09ee88 20aee88 000064 00   A  0   0  4
  [31] .rela.dyn         RELA            ffffffc00a09eef0 20aeef0 000078 18   A  0   0  8
  [32] .relr.dyn         ANDROID_RELR    ffffffc00a09ef68 20aef68 008f20 08   A  0   0  8
  [33] .data             PROGBITS        ffffffc00a0b0000 20c0000 1c0aa0 00  WA  0   0 4096
  [34] __bug_table       PROGBITS        ffffffc00a270aa0 2280aa0 0205b0 00  WA  0   0  4
  [35] .mmuoff.data.write PROGBITS       ffffffc00a291800 22a1800 000018 00  WA  0   0 2048
  [36] .mmuoff.data.read PROGBITS        ffffffc00a292000 22a2000 000008 00  WA  0   0  8
  [37] .pecoff_edata_padding PROGBITS    ffffffc00a292008 22a2008 0001f8 00  WA  0   0  1
  [38] .sbss             PROGBITS        ffffffc00a293000 22a2200 000000 00  WA  0   0  1
  [39] .bss              NOBITS          ffffffc00a293000 22a3000 0f7c94 00  WA  0   0 4096
  [40] .eh_frame         PROGBITS        ffffffc00a390000 22b0000 02ba94 00   A  0   0  8
  [41] .debug_aranges    PROGBITS        0000000000000000 22dba94 000a20 00      0   0  1
  [42] .debug_info       PROGBITS        0000000000000000 22dc4b4 1261748f 00      0   0  1
  [43] .debug_abbrev     PROGBITS        0000000000000000 148f3943 3ab68a 00      0   0  1
  [44] .debug_line       PROGBITS        0000000000000000 14c9efcd 15eff48 00      0   0  1
  [45] .debug_frame      PROGBITS        0000000000000000 1628ef18 25c2d0 00      0   0  8
  [46] .debug_str        PROGBITS        0000000000000000 164eb1e8 498de6 01  MS  0   0  1
  [47] .debug_loc        PROGBITS        0000000000000000 16983fce 4187c98 00      0   0  1
  [48] .debug_ranges     PROGBITS        0000000000000000 1ab0bc66 b21e80 00      0   0  1
  [49] .comment          PROGBITS        0000000000000000 1b62dae6 000116 01  MS  0   0  1
  [50] .symtab           SYMTAB          0000000000000000 1b62dc00 cb2320 18     51 510804  8
  [51] .strtab           STRTAB          0000000000000000 1c2dff20 581142 00      0   0  1
  [52] .shstrtab         STRTAB          0000000000000000 1c861062 00028b 00      0   0  1
Key to Flags:
  W (write), A (alloc), X (execute), M (merge), S (strings), I (info),
  L (link order), O (extra OS processing required), G (group), T (TLS),
  C (compressed), x (unknown), o (OS specific), E (exclude),
  R (retain), p (processor specific)

现在我们通过代码来计算这个值:

// arch/arm64/include/asm/memory.h
// The kernel image is linked immediately after the end of the module region.
#define KIMAGE_VADDR		(MODULES_END)
#define MODULES_END		(MODULES_VADDR + MODULES_VSIZE)

// Size of the module region
#define MODULES_VSIZE		(SZ_128M)
#define SZ_128M				0x08000000

// Start of the module region
#define MODULES_VADDR		(KASAN_SHADOW_END)
// KASAN_SHADOW_END depends on whether either KASAN option below is configured.
// The article's author reports that both are "not set" in the /proc/config.gz
// exported from the device under discussion, so the #else branch applies there.
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_SHADOW_OFFSET	_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
#define KASAN_SHADOW_END	((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \
					+ KASAN_SHADOW_OFFSET)
#define KASAN_THREAD_SHIFT	1
#else
#define KASAN_THREAD_SHIFT	0
#define KASAN_SHADOW_END	(_PAGE_END(VA_BITS_MIN))
#endif /* CONFIG_KASAN */

// Selecting VA_BITS_MIN
#if VA_BITS > 48
#define VA_BITS_MIN		(48)
#else
#define VA_BITS_MIN		(VA_BITS)   // branch taken on the author's device: CONFIG_ARM64_VA_BITS_39=y, so VA_BITS_MIN = 39
#endif

// _PAGE_END is a macro (not a function).
// Negating (1 << (va - 1)) in 64-bit two's complement gives a value whose bits
// 63 .. (va - 1) are all 1 and whose lower bits are all 0.
#define _PAGE_END(va)		(-(UL(1) << ((va) - 1)))

经过如上的分析,此时内核的起始地址即为:
KIMAGE_VADDR = _PAGE_END(39) + SZ_128M
= -(1 << (39 - 1)) + 0x0800 0000
= -(0x40 0000 0000) + 0x0800 0000 //负数按 64 位补码表示,-(0x40 0000 0000) 即 0xFFFF FFC0 0000 0000
= 0xFFFF FFC0 0000 0000 + 0x0800 0000
= 0xFFFFFFC008000000
这就和从 vmlinux 中读出来的一致了

llvm-readelf.exe -h D:\dump\vmlinux\vmlinux
ELF Header:
  Magic:   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
  Class:                             ELF64
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              DYN (Shared object file)
  Machine:                           AArch64
  Version:                           0x1
  Entry point address:               0xFFFFFFC008000000   //入口点地址,与上面计算出的 KIMAGE_VADDR 一致
  Start of program headers:          64 (bytes into file)
  Start of section headers:          478548720 (bytes into file)
  Flags:                             0x0
  Size of this header:               64 (bytes)
  Size of program headers:           56 (bytes)
  Number of program headers:         25
  Size of section headers:           64 (bytes)
  Number of section headers:         53
  Section header string table index: 52

二、head.S

/*
 * Kernel startup entry point.
 * ---------------------------
 *
 * The requirements are:
 *   MMU = off, D-cache = off, I-cache = on or off,
 *   x0 = physical address to the FDT blob.
 *
 * This code is mostly position independent so you call this at
 * __pa(PAGE_OFFSET).
 *
 * Note that the callee-saved registers are used for storing variables
 * that are useful before the MMU is enabled. The allocations are described
 * in the entry routines.
 */
 /*********************************************************************************
  *
  * ARMV8支持EL2和EL3,这些异常等级都可以引导Linux内核的运行;
  * Linux内核运行在EL1,
  * kernel启动条件的要求:
  * CPU:
  * 屏蔽CPU上所有的中断,比如清除PSTATE寄存器的DAIF域;
  * CPU必须处在EL2或非安全模式的EL1
  *
  * MMU和高速缓存:
  * 关闭MMU;
  * 关闭数据高速缓存;//清除内核镜像加载的地址范围的高速缓存,最简单办法,关闭缓存
  * 指令高速缓存可关闭或打开;//因为u-boot和内核指令代码不会重叠,缓存不会出错
  *
  * 其他:
  * X0寄存器指向设备树的物理地址;
  * 设置时钟,CNTFRQ和CNTVOFF寄存器;
  * 内存一致性;
  *
  * U-boot的作用是加载内核镜像到内存,跳转到kernel入口地址,即这里!
  ********************************************************************************/
	__HEAD
	/*
	 * DO NOT MODIFY. Image header expected by Linux boot-loaders.
	 */
	efi_signature_nop			// special NOP to identify as PE/COFF executable
	b	primary_entry			// branch to kernel start, magic (jumps to the assembly boot entry point)
	.quad	0				// Image load offset from start of RAM, little-endian
	le64sym	_kernel_size_le			// Effective size of kernel image, little-endian
	le64sym	_kernel_flags_le		// Informative flags, little-endian
	.quad	0				// reserved
	.quad	0				// reserved
	.quad	0				// reserved
	.ascii	ARM64_IMAGE_MAGIC		// Magic number
	.long	.Lpe_header_offset		// Offset to the PE header.

	__EFI_PE_HEADER

	__INIT  // code from here on is placed in the ".init.text" section

2.1 primary_entry

进入 Linux 内核,汇编部分主要完成以下工作:

/*
 * primary_entry - first code reached from the image header's branch.
 * Runs the early boot steps in order and never returns: the final
 * instruction is a plain branch to __primary_switch.
 */
SYM_CODE_START(primary_entry)
	bl	preserve_boot_args      // save the boot arguments into the boot_args[] array
	bl	init_kernel_el			// w0=cpu_boot_mode
	adrp	x23, __PHYS_OFFSET
	and	x23, x23, MIN_KIMG_ALIGN - 1	// KASLR offset, defaults to 0
	bl	set_cpu_boot_mode_flag    // record the boot mode (w0) in the __boot_cpu_mode variable
	bl	__create_page_tables      // build the identity-map and kernel-image page tables
	/*
	 * The following calls CPU setup code, see arch/arm64/mm/proc.S for
	 * details.
	 * On return, the CPU will be ready for the MMU to be turned on and
	 * the TCR will have been set.
	 */
	bl	__cpu_setup			// initialise processor
	b	__primary_switch    // enable the MMU and continue towards start_kernel() (the C entry point)
SYM_CODE_END(primary_entry)

下面细看每个函数内容

2.1.1 preserve_boot_args

 ///把引导程序传递过来的参数x0~x3保存到boot_args[]数组中
/*
 * preserve_boot_args - store the values of x0..x3 at kernel entry into
 * boot_args[] and keep the FDT pointer in x21.
 *
 * In:   x0..x3 = values handed over by the boot loader (x0 = FDT address)
 * Out:  x21 = FDT address; x0/x1 are overwritten
 * Ends with a tail call, so dcache_inval_poc returns straight to our caller.
 */
SYM_CODE_START_LOCAL(preserve_boot_args)
	mov	x21, x0				// x21=FDT (x0 is reused as a pointer below, so stash it first)

	adr_l	x0, boot_args			// record the contents of
	stp	x21, x1, [x0]			// x0 .. x3 at kernel entry
	stp	x2, x3, [x0, #16]       // all four 8-byte values are now in boot_args

	dmb	sy				// needed before dc ivac with
						// MMU off
						// (orders the stp stores above ahead of the
						// cache maintenance done by the callee)

	add	x1, x0, #0x20			// 4 x 8 bytes
	// Here x0 = address of boot_args (NOT the FDT) and x1 = x0 + 32, i.e. the
	// address just past the array. Presumably dcache_inval_poc takes a
	// (start, end) pair -- __create_page_tables calls it the same way with
	// init_pg_dir / init_pg_end -- TODO confirm against its definition.
	b	dcache_inval_poc		// tail call
SYM_CODE_END(preserve_boot_args)

2.1.2 init_kernel_el

/*
 * init_kernel_el - branch on the exception level we were entered at.
 * The EL1 path below programs SCTLR_EL1/SPSR_EL1/ELR_EL1 and "returns" to
 * the caller with eret, leaving BOOT_CPU_MODE_EL1 in w0.
 * NOTE(review): this excerpt is truncated -- the body of init_el2 and the
 * closing SYM_FUNC_END are not shown here.
 */
SYM_FUNC_START(init_kernel_el)
    mrs x0, CurrentEL
    cmp x0, #CurrentEL_EL2
    b.eq    init_el2                   // entered at EL2: take the EL2 path instead

SYM_INNER_LABEL(init_el1, SYM_L_LOCAL)
    mov_q   x0, INIT_SCTLR_EL1_MMU_OFF // initial SCTLR_EL1 value; going by the name, the MMU stays off
    msr sctlr_el1, x0
    isb                                // synchronise so the SCTLR_EL1 write takes effect before continuing
    mov_q   x0, INIT_PSTATE_EL1        // PSTATE that eret will install (presumably EL1 with interrupts masked -- TODO confirm)
    msr spsr_el1, x0
    msr elr_el1, lr                    // eret resumes at our caller's return address
    mov w0, #BOOT_CPU_MODE_EL1         // return value: this CPU booted in EL1
    eret

SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)

2.1.3 set_cpu_boot_mode_flag

/*
 * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
 * in w0. See arch/arm64/include/asm/virt.h for more info.
 *
 * In:       w0 = boot mode (as left in w0 by init_kernel_el)
 * Clobbers: x1, flags
 *
 * __boot_cpu_mode is treated as two consecutive 32-bit slots: the first is
 * written for any mode other than BOOT_CPU_MODE_EL2, the second (offset 4)
 * for BOOT_CPU_MODE_EL2.
 */
SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag)
    adr_l   x1, __boot_cpu_mode     // x1 = address of the first slot
    cmp w0, #BOOT_CPU_MODE_EL2
    b.ne    1f
    add x1, x1, #4                  // booted in EL2: select the second slot
1:  str w0, [x1]                    // This CPU has booted in EL1 (or EL2, per the slot chosen above)
    dmb sy                          // order the store ahead of the cache maintenance below
    dc  ivac, x1                    // Invalidate potentially stale cache line
    ret
SYM_FUNC_END(set_cpu_boot_mode_flag)

2.1.4 __create_page_tables

创建恒等映射页表,以及内核映像映射页表,这部分在下一个大章节中详细描述

三、内核启动开始阶段的内存映射

  1. CPU启动时,MMU是关闭的,CPU访问的是物理地址,而MMU开启后,访问的是虚拟地址;
  2. 现代处理器大多支持多级流水线,处理器会提前预取多条指令到流水线中, 当打开MMU时,CPU已经预取多条指令到流水线中,并且这些指令都是用物理地址预取的;MMU开启后,将以虚拟地址访问,这样继续访问流水线中预取的指令 (按物理地址预取),就很容易出错;{% u 为解决这个问题,引入“ 恒等映射”,即将虚拟地址映射到相等的物理地址 %},可以巧妙的解决上述问题;这里建立的恒等映射是小范围的,一般内核镜像占用的空间就几M;恒等映射完毕,开启MMU,CPU进入虚拟地址访问阶段;

3.1内存恒等映射

3.1.1 __create_page_tables

/*
 * __create_page_tables (first half, as excerpted here): invalidate and zero
 * the init page tables, record vabits_actual, optionally extend the ID-map
 * VA range, then build the identity mapping of the .idmap.text range in
 * idmap_pg_dir with map_memory.
 * NOTE(review): this excerpt closes the function right after the identity
 * map; no return instruction using x28 is shown here.
 */
SYM_FUNC_START_LOCAL(__create_page_tables)
	mov	x28, lr  // save the return address: the bl below overwrites lr

	/*
	 * Invalidate the init page tables to avoid potential dirty cache lines
	 * being evicted. Other page tables are allocated in rodata as part of
	 * the kernel image, and thus are clean to the PoC per the boot
	 * protocol.
	 */

	/*
	 * Per the article, the init page-table area is laid out in
	 * vmlinux.lds.S with size INIT_DIR_SIZE:
	 * init_pg_dir = .;
	 * . += INIT_DIR_SIZE;
	 * init_pg_end = .;
	 */

	adrp	x0, init_pg_dir
	adrp	x1, init_pg_end
	bl	dcache_inval_poc	// invalidate the data cache over [init_pg_dir, init_pg_end)

	/*
	 * Clear the init page tables (zero-fill, 64 bytes per loop iteration).
	 */
	adrp	x0, init_pg_dir
	adrp	x1, init_pg_end
	sub	x1, x1, x0
1:	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	subs	x1, x1, #64
	b.ne	1b

	mov	x7, SWAPPER_MM_MMUFLAGS  // attribute flags for the final-level entries (passed to map_memory as \flags)

	/*
	 * Create the identity mapping.
	 */

	/*
	 * Per the article, the ID-map table area is laid out in vmlinux.lds.S
	 * with size IDMAP_DIR_SIZE, typically three consecutive 4KB pages
	 * (one each for the PGD, PUD and PMD tables); a 2MB block mapping is
	 * built here.
	 * idmap_pg_dir = .;
	 * . += IDMAP_DIR_SIZE;
	 * idmap_pg_end = .;
	 */
     // On the author's "flame" project IDMAP_DIR_SIZE is ((48-4)/(12-3)-1) * 4KB = 3 * 4KB

	adrp	x0, idmap_pg_dir

	// Start of the .idmap.text range. Per the article, the identity map is
	// also needed outside initial boot, e.g. by cpu_do_resume when a CPU wakes.
	adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)

#ifdef CONFIG_ARM64_VA_BITS_52
	mrs_s	x6, SYS_ID_AA64MMFR2_EL1
	and	x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
	mov	x5, #52
	cbnz	x6, 1f
#endif
	mov	x5, #VA_BITS_MIN  // 39 on the author's "flame" project
1:
	adr_l	x6, vabits_actual
	str	x5, [x6]        // store the chosen VA width (52 or VA_BITS_MIN) in vabits_actual
	dmb	sy              // order the store ahead of the cache maintenance below
	dc	ivac, x6		// Invalidate potentially stale cache line

	/*
	 * VA_BITS may be too small to allow for an ID mapping to be created
	 * that covers system RAM if that is located sufficiently high in the
	 * physical address space. So for the ID map, use an extended virtual
	 * range in that case, and configure an additional translation level
	 * if needed.
	 *
	 * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
	 * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
	 * this number conveniently equals the number of leading zeroes in
	 * the physical address of __idmap_text_end.
	 */
	adrp	x5, __idmap_text_end
	clz	x5, x5 // x5 = number of zero bits above the highest set bit
	cmp	x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough?
	b.ge	1f			// .. then skip VA range extension (__idmap_text_end fits within VA_BITS_MIN)

	adr_l	x6, idmap_t0sz
	str	x5, [x6]
	dmb	sy
	dc	ivac, x6		// Invalidate potentially stale cache line

#if (VA_BITS < 48)
#define EXTRA_SHIFT	(PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS	(1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))

	/*
	 * If VA_BITS < 48, we have to configure an additional table level.
	 * First, we have to verify our assumption that the current value of
	 * VA_BITS was chosen such that all translation levels are fully
	 * utilised, and that lowering T0SZ will always result in an additional
	 * translation level to be configured.
	 */
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif

	mov	x4, EXTRA_PTRS
	create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
#else
	/*
	 * If VA_BITS == 48, we don't have to configure an additional
	 * translation level, but the top-level table has more entries.
	 */
	mov	x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
	str_l	x4, idmap_ptrs_per_pgd, x5
#endif
1:
	ldr_l	x4, idmap_ptrs_per_pgd   // number of PGD entries for the ID map (PTRS_PER_PGD unless overridden above -- TODO confirm initial value)
	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)

	// Build the page tables for the .idmap.text range with map_memory.
/*
 * Arguments, in macro-parameter order:
 * x0 (tbl):    idmap_pg_dir
 * x1 (rtbl):   scratch, set by the macro itself
 * x3 (vstart): __idmap_text_start
 * x6 (vend):   __idmap_text_end
 * x7 (flags):  SWAPPER_MM_MMUFLAGS
 * x3 (phys):   __idmap_text_start again -- virtual == physical, hence "identity"
 * x4 (pgds):   number of PGD entries
 * x10-x14:     temporaries
*/
	map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14

SYM_FUNC_END(__create_page_tables)

3.2 粗粒度内核镜像映射

3.2.1 map_memory

/*
 * map_memory - fill the page tables so that [vstart, vend) maps to phys.
 *
 * tbl:    base address of the top-level page table
 * rtbl:   address of the next-level table; set to tbl + PAGE_SIZE on entry
 * vstart: first virtual address to map
 * vend:   end virtual address (exclusive; the macro subtracts 1 first)
 * flags:  attribute bits for the final-level (block) entries
 * phys:   physical address that vstart maps to
 * pgds:   number of entries in the top-level (PGD) table
 * istart, iend, tmp, count, sv: temporaries (clobbered)
 *
 * Clobbers tbl, rtbl and vend as well.
 */
	.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
	sub \vend, \vend, #1
	add \rtbl, \tbl, #PAGE_SIZE
	mov \sv, \rtbl
	mov \count, #0

	// compute_indices yields the table indices of vstart and vend at this level
	compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
	// top level (PGD): write table descriptors pointing at the next-level table(s)
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov \tbl, \sv		// descend: the table just pointed to becomes the current table
	mov \sv, \rtbl

#if SWAPPER_PGTABLE_LEVELS > 3
	// PUD level, only present when more than three levels are in use
	compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov \tbl, \sv
	mov \sv, \rtbl
#endif

#if SWAPPER_PGTABLE_LEVELS > 2
	// next table level, only present when more than two levels are in use
	compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
	populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
	mov \tbl, \sv
#endif

	// final level: entries map memory directly, one SWAPPER_BLOCK_SIZE block each
	compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
	bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1	// count is reused as a temp: phys rounded down to a block boundary
	populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
	.endm

3.2.2 compute_indices

/**************************************************************
 * compute_indices - compute the table indices of vstart and vend at one
 * page-table level; results are returned in istart and iend.
 *
 * vstart: first virtual address of the range
 * vend:   last virtual address of the range
 * shift:  bit position of this level's index field within the address
 * ptrs:   number of entries in one table at this level
 * istart: out - index of vstart, (vstart >> shift) & (ptrs - 1)
 * iend:   out - index of vend, plus count * ptrs (see below)
 * count:  in  - value carried over from the previous level; it is
 *               multiplied by ptrs and added to iend, so iend can run
 *               past the end of a single table
 *         out - iend - istart
 **************************************************************/

	.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
	lsr	\iend, \vend, \shift
	mov	\istart, \ptrs
	sub	\istart, \istart, #1
	and	\iend, \iend, \istart	// iend = (vend >> shift) & (ptrs - 1)
	mov	\istart, \ptrs
	mul	\istart, \istart, \count
	add	\iend, \iend, \istart	// iend += count * ptrs
					// our entries span multiple tables
					// (istart is only a scratch register up to here)

	lsr	\istart, \vstart, \shift
	mov	\count, \ptrs
	sub	\count, \count, #1
	and	\istart, \istart, \count // istart = (vstart >> shift) & (ptrs - 1)

	sub	\count, \iend, \istart  // count = iend - istart, handed on to the next level
	.endm

3.2.3 populate_entries

/*******************************************************************
     * populate_entries - write page-table entries [index, eindex].
     *
     * Each entry is phys_to_pte(rtbl) | flags; after each store rtbl is
     * advanced by inc. Entries may therefore point at consecutive
     * next-level tables (inc = PAGE_SIZE) or at consecutive blocks of
     * physical memory (inc = block size), depending on the caller.
     *
     *  tbl:    base address of the table being filled
     *  rtbl:   address the first entry should point to (next-level
     *          table or physical memory); advanced by inc per entry
     *  index:  first index to write
     *  eindex: last index to write (inclusive)
     *  flags:  bits ORed into every entry
     *  inc:    increment applied to rtbl between entries
     *  tmp1:   temporary variable
     *
     *  Preserves: tbl, eindex, flags, inc
     *  Corrupts:  index, tmp1
     *  Returns:   rtbl (address one step past the last entry's target)
     *********************************************************************/

    .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
.Lpe\@: phys_to_pte \tmp1, \rtbl
    orr \tmp1, \tmp1, \flags    // tmp1 = table entry
    str \tmp1, [\tbl, \index, lsl #3]   // 8-byte entries: slot address = tbl + index * 8
    add \rtbl, \rtbl, \inc  // rtbl = pa next level (i.e. the target of the NEXT entry, one inc further on)
    add \index, \index, #1  // advance to the next slot; without this the loop never terminates
    cmp \index, \eindex     // more entries to write?
    b.ls    .Lpe\@          // unsigned index <= eindex: loop, so eindex itself is written too
    .endm

综上,.idmap.text 段的虚拟地址映射到了相同的物理地址上,这个映射表在 idmap_pg_dir 中;
{% tip success %}
问题:哪些函数位于这段恒等映射的 2MB 内存中?
由 head.S 中的定义可知:
.section .idmap.text,awx

__enable_mmu、__primary_switch、__cpu_setup 等汇编函数都在 .idmap.text 段中;可以从 System.map 文件中得到验证;这些函数在 Linux“自举”过程中会用到;
{% endtip %}

{% tip success %}
问题:为什么要使用map_memory创建第二个页表?
CPU 刚启动时,物理内存一般都在低地址(不会超过 256T 大小),恒等映射的地址实际在用户空间了,即 MMU 启用后 idmap_pg_dir 会填入 TTBR0; 而内核空间的链接地址都是在高地址(内核空间在高地址),需要填入 TTBR1;因此,这里再建一张表,映射整个内核镜像,且虚拟地址空间是在高地址区 0xffffxxxx xxxx xxxx
{% endtip %}

/*
     * Map the kernel image (starting with PHYS_OFFSET).
     */

    /*
     * Use the map_memory macro to build the page tables that map the
     * whole kernel image.
     *
     * Why build a second set of tables?
     * At boot, physical memory typically sits at low addresses, so the
     * identity-mapped virtual addresses fall in the low (user-space) half
     * of the address space; once the MMU is on, idmap_pg_dir is installed
     * in TTBR0. The kernel, however, is linked at high addresses
     * (0xffffxxxx xxxx xxxx), which are translated through TTBR1. A
     * separate table, init_pg_dir, is therefore built to map the whole
     * kernel image at its high virtual addresses.
     *
     * init_pg_dir versus idmap_pg_dir:
     * (1) init_pg_dir maps high virtual addresses (0xffff xxxx xxxx xxxx)
     *     and goes into TTBR1; idmap_pg_dir maps low virtual addresses
     *     (0x0000 xxxx xxxx xxxx) and goes into TTBR0.
     * (2) init_pg_dir covers the whole kernel image (_text .. _end);
     *     idmap_pg_dir covers only about 2MB and exists purely to get
     *     safely across the moment the MMU is switched on.
     */
    adrp    x0, init_pg_dir
    mov_q   x5, KIMAGE_VADDR        // compile time __va(_text)
    add x5, x5, x23         // add KASLR displacement
    mov x4, PTRS_PER_PGD
    adrp    x6, _end            // runtime __pa(_end)
    adrp    x3, _text           // runtime __pa(_text)
    sub x6, x6, x3          // _end - _text
    add x6, x6, x5          // runtime __va(_end)

    map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14

四、__cpu_setup 函数

为打开MMU做一些处理器相关的初始化

/*
 *  __cpu_setup
 *
 *  Initialise the processor for turning the MMU on.
 *
 *  Programs CPACR_EL1, MDSCR_EL1, MAIR_EL1 and TCR_EL1; SCTLR_EL1 itself is
 *  NOT written here, its intended value is only returned.
 *
 * Output:
 *  Return in x0 the value of the SCTLR_EL1 register.
 */
    .pushsection .idmap.text, awx  // place __cpu_setup in the .idmap.text section
SYM_FUNC_START(__cpu_setup)
    tlbi    vmalle1             // Invalidate local TLB
    dsb nsh                     // wait for the TLB invalidation to complete

    mov x1, #3 << 20
    msr cpacr_el1, x1           // Enable FP/ASIMD (sets bits [21:20] of CPACR_EL1)
    mov x1, #1 << 12            // Reset mdscr_el1 and disable
    msr mdscr_el1, x1           // access to the DCC from EL0
    isb                 // Unmask debug exceptions now,
    enable_dbg              // since this is per-cpu (unmasks debug exceptions in PSTATE)
    reset_pmuserenr_el0 x1          // Disable PMU access from EL0
    reset_amuserenr_el0 x1          // Disable AMU access from EL0

    /*
     * Default values for VMSA control registers. These will be adjusted
     * below depending on detected CPU features.
     */
    mair    .req    x17
    tcr .req    x16
    mov_q   mair, MAIR_EL1_SET

    // Compose the initial TCR_EL1 value; TCR_EL1 controls address translation
    mov_q   tcr, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
            TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
            TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS

...
    tcr_clear_errata_bits tcr, x9, x5

#ifdef CONFIG_ARM64_VA_BITS_52
    ldr_l       x9, vabits_actual
    sub     x9, xzr, x9
    add     x9, x9, #64             // x9 = 64 - vabits_actual
    tcr_set_t1sz    tcr, x9
#else
    ldr_l       x9, idmap_t0sz  
#endif
    tcr_set_t0sz    tcr, x9

    /*
     * Set the IPS bits in TCR_EL1.
     */
    tcr_compute_pa_size tcr, #TCR_IPS_SHIFT, x5, x6  // fill the IPS field (physical address size)
#ifdef CONFIG_ARM64_HW_AFDBM
    /*
     * Enable hardware update of the Access Flags bit.
     * Hardware dirty bit management is enabled later,
     * via capabilities.
     */
    mrs x9, ID_AA64MMFR1_EL1
    and x9, x9, #0xf
    cbz x9, 1f
    orr tcr, tcr, #TCR_HA       // hardware Access flag update
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
    msr mair_el1, mair
    msr tcr_el1, tcr
    /*
     * Prepare SCTLR
     */
    mov_q   x0, INIT_SCTLR_EL1_MMU_ON // return value; consumed as x0 by __enable_mmu
    ret                 // return to head.S

    .unreq  mair
    .unreq  tcr
SYM_FUNC_END(__cpu_setup)

五、__primary_switch函数

启动MMU,并跳转到start_kernel()函数(进入内核的C语言部分)

/*
 * __primary_switch - turn the MMU on, apply relocations if configured, and
 * jump to __primary_switched at its link-time (virtual) address.
 *
 * In: x0 = SCTLR_EL1 value with the MMU enabled (returned by __cpu_setup)
 *
 * With CONFIG_RANDOMIZE_BASE, __primary_switched may return here with a
 * KASLR displacement in x23; the kernel mapping is then rebuilt with the MMU
 * temporarily off and the jump is taken again.
 */
SYM_FUNC_START_LOCAL(__primary_switch)
#ifdef CONFIG_RANDOMIZE_BASE   // KASLR: the kernel image's virtual address is randomised at boot
    mov x19, x0             // preserve new SCTLR_EL1 value
    mrs x20, sctlr_el1          // preserve old SCTLR_EL1 value
#endif

    adrp    x1, init_pg_dir
    bl  __enable_mmu           // x0 = SCTLR_EL1 value, x1 = init_pg_dir (goes into TTBR1); turns the MMU on
#ifdef CONFIG_RELOCATABLE      // kernel image built as relocatable
#ifdef CONFIG_RELR
    mov x24, #0             // no RELR displacement yet
#endif
    bl  __relocate_kernel
#ifdef CONFIG_RANDOMIZE_BASE
    ldr x8, =__primary_switched
    adrp    x0, __PHYS_OFFSET
    blr x8

    /*
     * If we return here, we have a KASLR displacement in x23 which we need
     * to take into account by discarding the current kernel mapping and
     * creating a new one.
     */
    pre_disable_mmu_workaround
    msr sctlr_el1, x20          // disable the MMU
    isb
    bl  __create_page_tables        // recreate kernel mapping

    tlbi    vmalle1             // Remove any stale TLB entries
    dsb nsh
    isb

    set_sctlr_el1   x19         // re-enable the MMU

    bl  __relocate_kernel
#endif
#endif
    ldr x8, =__primary_switched     // absolute (link-time virtual) address, loaded from a literal
    adrp    x0, __PHYS_OFFSET
    br  x8                      // leave the identity-mapped code: continue at the kernel's virtual address (no return)
SYM_FUNC_END(__primary_switch)

5.1 __enable_mmu函数

/*
 * Enable the MMU.
 *
 *  x0  = SCTLR_EL1 value for turning on the MMU.
 *  x1  = TTBR1_EL1 value
 *
 * Returns to the caller via x30/lr. This requires the caller to be covered
 * by the .idmap.text section.
 *
 * Checks if the selected granule size is supported by the CPU.
 * If it isn't, park the CPU
 *
 * Clobbers x2, x3 and the condition flags; x1 is converted in place.
 */
SYM_FUNC_START(__enable_mmu)
	mrs	x2, ID_AA64MMFR0_EL1
	ubfx	x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4	// extract the 4-bit granule-support field
	cmp     x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN
	b.lt    __no_granule_support
	cmp     x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX
	b.gt    __no_granule_support
	update_early_cpu_boot_status 0, x2, x3
	adrp	x2, idmap_pg_dir		// TTBR0 always gets the identity-map tables
	phys_to_ttbr x1, x1
	phys_to_ttbr x2, x2
	msr	ttbr0_el1, x2			// load TTBR0
	offset_ttbr1 x1, x3
	msr	ttbr1_el1, x1			// load TTBR1 (both table base addresses are now installed)
	isb

	set_sctlr_el1	x0          // write SCTLR_EL1 with the caller's value, which turns the MMU on

	ret
SYM_FUNC_END(__enable_mmu)

5.2 __primary_switched函数

/*
 * __primary_switched - first code run at kernel virtual addresses.
 *
 * In: x0  = __PHYS_OFFSET (set by __primary_switch just before the jump)
 *     x21 = FDT pointer, x23 = KASLR offset
 *
 * Sets up the initial task and vector table, records the FDT pointer and
 * kimage_voffset, clears .bss and calls start_kernel. With
 * CONFIG_RANDOMIZE_BASE it may instead return to __primary_switch so the
 * kernel can be remapped at a randomised address.
 */
SYM_FUNC_START_LOCAL(__primary_switched)
	// The MMU is on: every address used from here on is a virtual address,
	// e.g. the statically defined init_task is reached through the new mapping.
	adr_l	x4, init_task           // x4 = &init_task (the statically defined initial task)

	init_cpu_task x4, x5, x6	// NOTE(review): presumably installs x4 as the current task and sets up its stack -- confirm in the macro

	adr_l	x8, vectors			// load VBAR_EL1 with virtual
	msr	vbar_el1, x8			// vector table address
	isb

	stp	x29, x30, [sp, #-16]!		// push a frame record (fp, lr) ...
	mov	x29, sp				// ... and point the frame pointer at it

    // save the device tree address
	str_l	x21, __fdt_pointer, x5		// Save FDT pointer

	ldr_l	x4, kimage_vaddr		// Save the offset between
	sub	x4, x4, x0			// the kernel virtual and
	str_l	x4, kimage_voffset, x5		// physical mappings

	// Clear BSS
	// i.e. __pi_memset(__bss_start, 0, __bss_stop - __bss_start)
	adr_l	x0, __bss_start
	mov	x1, xzr
	adr_l	x2, __bss_stop
	sub	x2, x2, x0
	bl	__pi_memset
	dsb	ishst				// Make zero page visible to PTW

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
	bl	kasan_early_init
#endif
	mov	x0, x21				// pass FDT address in x0
	bl	early_fdt_map			// Try mapping the FDT early
	bl	init_feature_override		// Parse cpu feature overrides
#ifdef CONFIG_RANDOMIZE_BASE
	tst	x23, ~(MIN_KIMG_ALIGN - 1)	// already running randomized?
	b.ne	0f
	bl	kaslr_early_init		// parse FDT for KASLR options
	cbz	x0, 0f				// KASLR disabled? just proceed
	orr	x23, x23, x0			// record KASLR offset
	ldp	x29, x30, [sp], #16		// we must enable KASLR, return
	ret					// to __primary_switch()
0:
#endif
	bl	switch_to_vhe			// Prefer VHE if possible
	ldp	x29, x30, [sp], #16    // pop the frame record pushed above; sp is back to its value before the push
	bl	start_kernel           // enter the C entry point
	ASM_BUG()				// placed after the call, so only reached if start_kernel returns
SYM_FUNC_END(__primary_switched)

从此刻开始,汇编结束了,内核开启了C语言的世界。。。