0. Preface

Following on from the previous post in this series ([Linux memory management] part 012: zone, one of the three core structures of physical memory management), this article covers zone initialization, which is the second half of bootmem_init() (the first half being sparse_init()). The second half revolves around zone_sizes_init().

```c
start_kernel()                 //init/main.c
    ---->setup_arch()          //arch/arm64/kernel/setup.c
        ---->bootmem_init()    //arch/arm64/mm/init.c
            ---->memblocks_present()  //mm/sparse.c
            ---->sparse_init()        //mm/sparse.c
            ---->zone_sizes_init()
```

Before dissecting the function itself, first note where its arguments come from:

arch/arm64/mm/init.c:

```c
void __init bootmem_init(void)
{
	unsigned long min, max;

	min = PFN_UP(memblock_start_of_DRAM());
	max = PFN_DOWN(memblock_end_of_DRAM());

	...
	zone_sizes_init(min, max);

	...
}
```

The min argument is the start PFN of the first region in memblock.memory, and max is the end PFN of its last region; that is, the PFNs of the lowest and highest usable memory known to the system.
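
To make the rounding concrete, here is a small standalone userspace sketch of what PFN_UP() and PFN_DOWN() compute (assuming 4KB pages; the DRAM range is hypothetical):

```c
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12  /* assume 4KB pages, as on a typical arm64 config */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* userspace re-implementations of the kernel macros */
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	/* hypothetical DRAM range as memblock might report it */
	uint64_t dram_start = 0x40000000;   /* 1GB */
	uint64_t dram_end   = 0x80000000;   /* 2GB */

	printf("min pfn = 0x%llx\n", (unsigned long long)PFN_UP(dram_start));
	printf("max pfn = 0x%llx\n", (unsigned long long)PFN_DOWN(dram_end));
	/* prints min pfn = 0x40000, max pfn = 0x80000 */
	return 0;
}
```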

Now let's dissect zone_sizes_init() properly:

1. zone_sizes_init

```c
/// Compute the maximum page frame number of each zone type, then
/// initialize every zone type in each node
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
	unsigned int __maybe_unused acpi_zone_dma_bits;
	unsigned int __maybe_unused dt_zone_dma_bits;
	phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);

#ifdef CONFIG_ZONE_DMA
	acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
	dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
	zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
	/// Compute the zone limit; by default this covers all of DRAM
	arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = dma32_phys_limit;
#endif
	if (!arm64_dma_phys_limit)
		arm64_dma_phys_limit = PHYS_MASK + 1;
	max_zone_pfns[ZONE_NORMAL] = max;

	/// Initialize the zone structures from each zone's maximum PFN
	free_area_init(max_zone_pfns);
}
```

This function computes the maximum page frame number of each zone and finally hands the resulting array to free_area_init(). Below we follow the code path on a UMA system.
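
To see how the DMA limit falls out of the firmware-reported addresses, here is a hedged userspace sketch of the zone_dma_bits computation. The fls64/min3 stand-ins and the example addresses are illustrative, and the real max_zone_phys() also accounts for the DRAM base, which is ignored here:

```c
#include <stdio.h>
#include <stdint.h>

/* userspace stand-in for fls64(): 1-based index of the highest set bit */
static unsigned int fls64_(uint64_t x)
{
	unsigned int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}
#define min3_(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) \
                                  : ((b) < (c) ? (b) : (c)))

int main(void)
{
	/* hypothetical max DMA-able CPU addresses from ACPI IORT and DT */
	uint64_t acpi_max = 0xFFFFFFFFULL;  /* 4GB - 1 */
	uint64_t dt_max   = 0x3FFFFFFFULL;  /* 1GB - 1: a device limited to 30 bits */

	unsigned int acpi_bits = fls64_(acpi_max);  /* 32 */
	unsigned int dt_bits   = fls64_(dt_max);    /* 30 */
	unsigned int zone_dma_bits = min3_(32U, dt_bits, acpi_bits);

	printf("zone_dma_bits = %u => ZONE_DMA limit = 0x%llx\n",
	       zone_dma_bits, (unsigned long long)(1ULL << zone_dma_bits));
	return 0;
}
```

So the most constrained DMA master in the system (here the hypothetical 30-bit device) decides how large ZONE_DMA ends up.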

2. free_area_init

```c
void __init free_area_init(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, zone;
	bool descending;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	start_pfn = find_min_pfn_with_active_regions();
	descending = arch_has_descending_max_zone_pfns();

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (descending)
			zone = MAX_NR_ZONES - i - 1;
		else
			zone = i;

		if (zone == ZONE_MOVABLE)
			continue;

		end_pfn = max(max_zone_pfn[zone], start_pfn);
		arch_zone_lowest_possible_pfn[zone] = start_pfn;
		arch_zone_highest_possible_pfn[zone] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info("  %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info("  Node %d: %#018Lx\n", i,
			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/*
	 * Print out the early node map, and initialize the
	 * subsection-map relative to active online memory ranges to
	 * enable future "sub-section" extensions of the memory map.
	 */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);
		subsection_map_init(start_pfn, end_pfn - start_pfn);
	}

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		/// Initialize node nid
		free_area_init_node(nid);

		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_MEMORY);
		check_for_memory(pgdat, nid);
	}

	/// Initialize the struct page entries
	memmap_init();
}
```

free_area_init() initializes the kernel's physical memory management: it sets the boundaries of the memory zones on each NUMA node, initializes their data structures, and completes the basic zone configuration.

A detailed walkthrough follows:

2.1 Clearing the zone boundary records

```c
memset(arch_zone_lowest_possible_pfn, 0, sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn));
```


  • Zero arch_zone_lowest_possible_pfn and arch_zone_highest_possible_pfn, which record the starting and ending page frame number (PFN) of each memory zone.
  • A page frame number (PFN) uniquely identifies a physical page: PFN = physical address / page size.

2.2 Determining the starting PFN

```c
start_pfn = find_min_pfn_with_active_regions();
descending = arch_has_descending_max_zone_pfns();
```

  • find_min_pfn_with_active_regions(): finds the lowest active PFN, used as the starting point of the initialization.
  • arch_has_descending_max_zone_pfns(): checks whether the architecture lays its zones out in descending order.
    • Some architectures allocate from high addresses downward, which affects the order in which zones are initialized.

2.3 Initializing the zone boundaries

```c
for (i = 0; i < MAX_NR_ZONES; i++) {
    if (descending)
        zone = MAX_NR_ZONES - i - 1;
    else
        zone = i;

    if (zone == ZONE_MOVABLE)
        continue;

    end_pfn = max(max_zone_pfn[zone], start_pfn);
    arch_zone_lowest_possible_pfn[zone] = start_pfn;
    arch_zone_highest_possible_pfn[zone] = end_pfn;

    start_pfn = end_pfn;
}
```

  • Iterates over all zones (ZONE_DMA, ZONE_NORMAL, and so on) in the order the architecture requires (ascending or descending).
  • end_pfn is the ending PFN of the current zone.
    • max_zone_pfn[zone] supplies the zone's maximum possible PFN.
    • If the zone is empty, start_pfn == end_pfn.
  • ZONE_MOVABLE is skipped here because its boundaries are computed separately later (a small simulation of this loop follows below).
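
A minimal userspace simulation of this splitting loop (ascending order; the max_zone_pfn values and the DRAM base are hypothetical) shows how contiguous, possibly empty zones fall out:

```c
#include <stdio.h>

#define MAX_NR_ZONES 3
enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL };  /* ZONE_MOVABLE omitted for brevity */

int main(void)
{
	/* hypothetical limits: DMA and DMA32 both end at 4GB, NORMAL at 5GB */
	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0x100000, 0x100000, 0x140000 };
	unsigned long lowest[MAX_NR_ZONES], highest[MAX_NR_ZONES];
	unsigned long start_pfn = 0x80000;  /* first usable PFN (DRAM at 2GB) */

	for (int zone = 0; zone < MAX_NR_ZONES; zone++) {
		unsigned long end_pfn = max_zone_pfn[zone] > start_pfn ?
					max_zone_pfn[zone] : start_pfn;
		lowest[zone] = start_pfn;
		highest[zone] = end_pfn;
		start_pfn = end_pfn;  /* the next zone starts where this one ends */
	}

	for (int zone = 0; zone < MAX_NR_ZONES; zone++)
		printf("zone %d: [0x%lx - 0x%lx)%s\n", zone, lowest[zone],
		       highest[zone], lowest[zone] == highest[zone] ? " empty" : "");
	return 0;
}
```

With these numbers, ZONE_DMA covers 2GB to 4GB, ZONE_DMA32 comes out empty (its limit equals the running start_pfn), and ZONE_NORMAL covers 4GB to 5GB, matching the "empty" case printed by the kernel.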

2.4 Computing the ZONE_MOVABLE boundaries

```c
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes();
```

  • ZONE_MOVABLE is a zone designed for memory hotplug and dynamic memory management; its starting PFN is computed dynamically.
  • find_zone_movable_pfns_for_nodes(): walks every node and determines the starting PFN of ZONE_MOVABLE on each.

2.5 Printing the zone information

pr_info("Zone ranges:\n");
pr_info("Movable zone start for each node\n");
pr_info("Early memory node ranges\n");
```c
pr_info("Zone ranges:\n");
pr_info("Movable zone start for each node\n");
pr_info("Early memory node ranges\n");
  • Prints each zone's boundaries (starting and ending PFN).
  • Prints where ZONE_MOVABLE starts on each node.
  • Prints the PFN ranges of the memory nodes.

2.6 Initializing each NUMA node

```c
for_each_online_node(nid) {
    pg_data_t *pgdat = NODE_DATA(nid);
    free_area_init_node(nid);

    if (pgdat->node_present_pages)
        node_set_state(nid, N_MEMORY);
    check_for_memory(pgdat, nid);
}
```

  • Iterates over all online NUMA nodes and calls free_area_init_node() to initialize each one:
    • sets up the node's zone information in pgdat;
    • initializes each zone's free_area structures (used by the buddy system).
  • If the node has usable memory pages, its state is set to N_MEMORY.

Note: free_area_init_node() is covered in its own section below.

2.7 Initializing struct page

```c
memmap_init();
```

  • struct page is the abstract representation of a physical page; every page frame has a corresponding struct page.
  • memmap_init() initializes these struct page entries, setting up the memory map.

2.8 free_area_init summary

  • free_area_init() is the core of Linux memory-management initialization: by partitioning zones, initializing nodes, and setting up the buddy system, it completes the groundwork for managing all physical memory.
  • It relies on a number of helper functions and architecture hooks, adjusting its behavior to the system configuration.
  • Its correct execution is the foundation on which the kernel's page allocator (e.g. alloc_pages) works.

3. free_area_init_node

```c
static void __init free_area_init_node(int nid)
{
	/// Get the memory node
	pg_data_t *pgdat = NODE_DATA(nid);
	unsigned long start_pfn = 0;
	unsigned long end_pfn = 0;

	/* pg_data_t should be reset to zero when it's allocated */
	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);

	/// Get this node's starting and ending page frames
	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

	/// On a UMA system there is only one node, so nid = 0
	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;
	pgdat->per_cpu_nodestats = NULL;

	/// Physical address range of node nid
	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
		(u64)start_pfn << PAGE_SHIFT,
		end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
	/// Compute the spanned_pages and present_pages members
	calculate_node_totalpages(pgdat, start_pfn, end_pfn);

	/// Allocate the mem_map array
	alloc_node_mem_map(pgdat);
	pgdat_set_deferred_range(pgdat);

	/// Initialize the node's core members
	free_area_init_core(pgdat);
}
```

Key functions at a glance:

  • calculate_node_totalpages
    computes the node's total page counts and classifies them:

    • spanned_pages: pages the node spans, including invalid or reserved pages
    • present_pages: pages actually available
  • alloc_node_mem_map: allocates space for the node's mem_map.

    mem_map is an array of struct page; every physical page has a corresponding descriptor.

  • free_area_init_core
    initializes the node's core management structures, mainly:

    • setting each zone's range
    • initializing the buddy system's free_area structures

3.1 calculate_node_totalpages

```c
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
						unsigned long node_start_pfn,
						unsigned long node_end_pfn)
{
	unsigned long realtotalpages = 0, totalpages = 0;
	enum zone_type i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zone *zone = pgdat->node_zones + i;
		unsigned long zone_start_pfn, zone_end_pfn;
		unsigned long spanned, absent;
		unsigned long size, real_size;

		spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
						     node_start_pfn,
						     node_end_pfn,
						     &zone_start_pfn,
						     &zone_end_pfn);
		absent = zone_absent_pages_in_node(pgdat->node_id, i,
						   node_start_pfn,
						   node_end_pfn);

		size = spanned;
		real_size = size - absent;

		if (size)
			zone->zone_start_pfn = zone_start_pfn;
		else
			zone->zone_start_pfn = 0;
		zone->spanned_pages = size;
		zone->present_pages = real_size;
#if defined(CONFIG_MEMORY_HOTPLUG)
		zone->present_early_pages = real_size;
#endif

		totalpages += size;
		realtotalpages += real_size;
	}

	pgdat->node_spanned_pages = totalpages;
	pgdat->node_present_pages = realtotalpages;
	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
```

First, zone_spanned_pages_in_node() is called to determine, for the current zone:

  • its start_pfn;
  • its end_pfn;
  • the zone size, which for now is simply the zone's entire spanned physical memory.

Then, zone_absent_pages_in_node() is called to determine:

  • the hole size, i.e. the size of the holes within the zone computed earlier.

From the zone's size and hole size, the zone's actual total memory real_size (with the holes removed) is derived.

After calculate_node_totalpages() completes, we have:

  • the zone's zone_start_pfn, the starting page frame number of the zone's physical memory;
  • the zone's spanned_pages, the total memory the zone spans;
  • the zone's present_pages, the zone's actual total memory (holes removed);
  • the node's spanned_pages, the node's total memory, i.e. the sum of all zones' spanned memory;
  • the node's present_pages, the node's actual total memory, i.e. all zones' memory minus their holes.
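
As a small worked example (all numbers hypothetical), the spanned/present arithmetic for one zone reduces to:

```c
#include <stdio.h>

int main(void)
{
	/* hypothetical zone: PFNs 0x80000-0xC0000 with a 0x8000-page hole */
	unsigned long zone_start_pfn = 0x80000, zone_end_pfn = 0xC0000;
	unsigned long absent = 0x8000;  /* pages inside the span not backed by DRAM */

	unsigned long spanned = zone_end_pfn - zone_start_pfn;  /* 0x40000 pages */
	unsigned long present = spanned - absent;               /* 0x38000 pages */

	printf("spanned_pages = 0x%lx, present_pages = 0x%lx\n", spanned, present);
	return 0;
}
```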

3.2 alloc_node_mem_map

This creates the node's memmap and applies when the sparse memory model is not in use, so it is not dissected here. Just note that the function is gated by CONFIG_FLAT_NODE_MEM_MAP.

3.3 free_area_init_core

```c
static void __init free_area_init_core(struct pglist_data *pgdat)
{
	enum zone_type j;
	int nid = pgdat->node_id;

	pgdat_init_internals(pgdat);
	pgdat->per_cpu_nodestats = &boot_nodestats;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		struct zone *zone = pgdat->node_zones + j;
		unsigned long size, freesize, memmap_pages;

		size = zone->spanned_pages;
		freesize = zone->present_pages;

		pr_debug("---j=%d, size=%lu,freesize=%lu\n",j, size, freesize);
		/*
		 * Adjust freesize so that it accounts for how much memory
		 * is used by this zone for memmap. This affects the watermark
		 * and per-cpu initialisations
		 */
		memmap_pages = calc_memmap_size(size, freesize);
		if (!is_highmem_idx(j)) {
			if (freesize >= memmap_pages) {
				freesize -= memmap_pages;
				if (memmap_pages)
					pr_debug("---  %s zone: %lu pages used for memmap\n",
						 zone_names[j], memmap_pages);
			} else
				pr_warn("  %s zone: %lu memmap pages exceeds freesize %lu\n",
					zone_names[j], memmap_pages, freesize);
		}

		/* Account for reserved pages */
		if (j == 0 && freesize > dma_reserve) {
			freesize -= dma_reserve;
			pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
		}

		if (!is_highmem_idx(j))
			nr_kernel_pages += freesize;
		/* Charge for highmem memmap if there are enough kernel pages */
		else if (nr_kernel_pages > memmap_pages * 2)
			nr_kernel_pages -= memmap_pages;
		nr_all_pages += freesize;

		/*
		 * Set an approximate value for lowmem here, it will be adjusted
		 * when the bootmem allocator frees pages into the buddy system.
		 * And all highmem pages will be managed by the buddy system.
		 */
		zone_init_internals(zone, j, nid, freesize);

		if (!size)
			continue;

		set_pageblock_order();
		/// Set the pageblock MIGRATE_TYPES; the accessors are
		/// get_pageblock_migratetype() and set_pageblock_migratetype()
		setup_usemap(zone);
		/// Initialize the free_area fields
		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
	}
}
```

3.3.1 pgdat_init_internals

```c
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
	pgdat_resize_init(pgdat);

	pgdat_init_split_queue(pgdat);
	pgdat_init_kcompactd(pgdat);

	init_waitqueue_head(&pgdat->kswapd_wait);
	init_waitqueue_head(&pgdat->pfmemalloc_wait);

	pgdat_page_ext_init(pgdat);
	lruvec_init(&pgdat->__lruvec);
}
```

  • pgdat_resize_init(): initializes the pgdat->node_size_lock spinlock;
  • pgdat_init_split_queue(): for transparent huge pages; requires CONFIG_TRANSPARENT_HUGEPAGE;
  • pgdat_init_kcompactd(): initializes the pgdat->kcompactd_wait wait queue;
  • init_waitqueue_head(): initializes the pgdat->kswapd_wait and pgdat->pfmemalloc_wait wait queues;
  • pgdat_page_ext_init(): initializes page_ext (a no-op unless CONFIG_PAGE_EXTENSION is enabled);
  • lruvec_init(&pgdat->__lruvec): initializes pgdat->__lruvec.

3.3.2 Initializing per_cpu_nodestats

pgdat->per_cpu_nodestats = &boot_nodestats;

pgdat->per_cpu_nodestats = &boot_nodestats;


This points per_cpu_nodestats at the global boot_nodestats, which manages the per-CPU node statistics; see enum node_stat_item for the details.

3.3.3 calc_memmap_size


```c
static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
						unsigned long present_pages)
{
	unsigned long pages = spanned_pages;

	/*
	 * Provide a more accurate estimation if there are holes within
	 * the zone and SPARSEMEM is in use. If there are holes within the
	 * zone, each populated memory region may cost us one or two extra
	 * memmap pages due to alignment because memmap pages for each
	 * populated regions may not be naturally aligned on page boundary.
	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
	 */
	if (spanned_pages > present_pages + (present_pages >> 4) &&
	    IS_ENABLED(CONFIG_SPARSEMEM))
		pages = present_pages;

	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
```

This function makes a more accurate estimate of how many pages each zone needs for its memmap.

There is a small heuristic in choosing pages: when the hole pages exceed present_pages/16, spanned_pages is considered to contain too many holes, and since hole regions are not necessarily page-aligned, the number of pages to map is capped at present_pages.

Every page is managed through a struct page, so the return value is the number of pages needed to hold all the struct page entries. The result is assigned to memmap_pages and printed:

```
[    0.000000] DMA32 zone: 484 pages used for memmap
[    0.000000] DMA32 zone: 0 pages reserved
[    0.000000] DMA32 zone: 30976 pages, LIFO batch:7
[    0.000000] Normal zone: 15116 pages used for memmap
[    0.000000] Normal zone: 952064 pages, LIFO batch:63
```


As the log shows, 484 pages of the DMA32 zone are used for the memmap.
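
That DMA32 number can be reproduced with a userspace copy of the heuristic, assuming sizeof(struct page) is 64 bytes and 4KB pages: 30976 spanned pages with no holes need 30976 * 64 / 4096 = 484 memmap pages.

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define STRUCT_PAGE_SIZE 64  /* assumed sizeof(struct page) on arm64 */

static unsigned long calc_memmap_size_(unsigned long spanned, unsigned long present)
{
	unsigned long pages = spanned;

	/* same heuristic as the kernel: with many holes, account present only */
	if (spanned > present + (present >> 4))
		pages = present;

	return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
	/* DMA32 zone from the log above: 30976 pages, no holes */
	printf("memmap pages = %lu\n", calc_memmap_size_(30976, 30976));
	/* prints 484, matching "DMA32 zone: 484 pages used for memmap" */
	return 0;
}
```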

3.3.4 Accounting nr_kernel_pages and nr_all_pages

Once memmap_pages has been accounted, the system considers that memory permanently unusable; it is deducted, and the remaining memory is recorded in nr_kernel_pages and nr_all_pages.

3.3.5 zone_init_internals()

Before analyzing this function, note its last parameter, remaining_pages: it is zone->present_pages minus memmap_pages (see section 3.3.3) and minus dma_reserve (usually 0).

Also note that zone->managed_pages here is only an early value: it is reset in memblock_free_all(), and the reserved portion is removed during the later buddy initialization, yielding the final managed_pages available to the buddy system.


```c
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
							unsigned long remaining_pages)
{
	atomic_long_set(&zone->managed_pages, remaining_pages);
	zone_set_nid(zone, nid);
	zone->name = zone_names[idx];
	zone->zone_pgdat = NODE_DATA(nid);
	spin_lock_init(&zone->lock);
	zone_seqlock_init(zone);
	zone_pcp_init(zone);
}
```

  • remaining_pages is stored into zone->managed_pages;
  • zone_set_nid() is a no-op on UMA, which has a single node;
  • the zone name is assigned from the global zone_names array;
  • the zone's pgdat is set, recording which node the zone belongs to so that the node can easily be found from the zone later; on UMA, zone_pgdat is contig_page_data;
  • the zone's lock spinlock is initialized;
  • zone_pcp_init() initializes the zone's pageset variables for the pcp (per-CPU pages) machinery.

Now let's look at zone_pcp_init():

```c
static __meminit void zone_pcp_init(struct zone *zone)
{
	/*
	 * per cpu subsystem is not up at this point. The following code
	 * relies on the ability of the linker to provide the
	 * offset of a (static) per cpu variable into the per cpu area.
	 */
	zone->per_cpu_pageset = &boot_pageset;
	zone->per_cpu_zonestats = &boot_zonestats;
	zone->pageset_high = BOOT_PAGESET_HIGH;
	zone->pageset_batch = BOOT_PAGESET_BATCH;

	if (populated_zone(zone))
		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
			 zone->present_pages, zone_batchsize(zone));
}
```

This is where the zone's pageset is initialized, using the address of the global boot_pageset.

boot_pageset is a per-CPU variable: each CPU gets a local instance whose struct per_cpu_pages member maintains an order-0 page list for that CPU; see section 2.1 of the previous article for the data-structure details.

Back in zone_pcp_init(), zone_batchsize() computes the batch value for the zone's pcp lists:

```c
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
	int batch;

	/*
	 * The number of pages to batch allocate is either ~0.1%
	 * of the zone or 1MB, whichever is smaller. The batch
	 * size is striking a balance between allocation latency
	 * and zone lock contention.
	 */
	batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE);
	batch /= 4;		/* We effectively *= 4 below */
	if (batch < 1)
		batch = 1;

	/*
	 * Clamp the batch to a 2^n - 1 value. Having a power
	 * of 2 value was found to be more likely to have
	 * suboptimal cache aliasing properties in some cases.
	 *
	 * For example if 2 tasks are alternately allocating
	 * batches of pages, one task can end up with a lot
	 * of pages of one half of the possible page colors
	 * and the other with pages of the other colors.
	 */
	batch = rounddown_pow_of_two(batch + batch/2) - 1;

	return batch;

#else
	/* The deferral and batching of frees should be suppressed under NOMMU
	 * conditions.
	 *
	 * The problem is that NOMMU needs to be able to allocate large chunks
	 * of contiguous memory as there's no hardware page translation to
	 * assemble apparent contiguous memory from discontiguous pages.
	 *
	 * Queueing large contiguous runs of pages for batching, however,
	 * causes the pages to actually be freed in smaller chunks.  As there
	 * can be a significant delay between the individual batches being
	 * recycled, this leads to the once large chunks of space being
	 * fragmented and becoming unavailable for high-order allocations.
	 */
	return 0;
#endif
}
```

batch 为 zone->managed_pages 除以 1024,如果所得的batch 大于 256 时,batch 重新取值为 256;接着用新的batch 除以 4,再减去1.

如下面的log,对于NORMAL zone 来说,因为 managed_pages 过大,所以最终batch 取63:

```
[ 0.000000] DMA32 zone: 484 pages used for memmap
[ 0.000000] DMA32 zone: 0 pages reserved
[ 0.000000] DMA32 zone: 30976 pages, LIFO batch:7
[ 0.000000] Normal zone: 15116 pages used for memmap
[ 0.000000] Normal zone: 952064 pages, LIFO batch:63
```
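
Both batch values in the log can be reproduced with a userspace copy of the arithmetic (approximating managed_pages by the present_pages shown in the log, which is close enough here):

```c
#include <stdio.h>

/* round down to the nearest power of two (userspace stand-in) */
static unsigned long rounddown_pow_of_two_(unsigned long n)
{
	unsigned long p = 1;
	while (p * 2 <= n)
		p *= 2;
	return p;
}

static int zone_batchsize_(unsigned long managed_pages)
{
	unsigned long batch;

	/* ~0.1% of the zone, capped at 1MB worth of 4KB pages (256) */
	batch = managed_pages >> 10;
	if (batch > 256)
		batch = 256;
	batch /= 4;
	if (batch < 1)
		batch = 1;

	/* clamp to 2^n - 1 to avoid cache-aliasing pathologies */
	return rounddown_pow_of_two_(batch + batch / 2) - 1;
}

int main(void)
{
	printf("DMA32:  batch = %d\n", zone_batchsize_(30976));   /* 7 */
	printf("Normal: batch = %d\n", zone_batchsize_(952064));  /* 63 */
	return 0;
}
```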

Note:

The batch computed here is not stored into the zone; it is only printed together with present_pages.

The real initialization happens in setup_per_cpu_pageset(), called from start_kernel() in init/main.c.

3.3.6 set_pageblock_order

```c
void __init set_pageblock_order(void)
{
	unsigned int order;

	/* Check that pageblock_nr_pages has not already been setup */
	if (pageblock_order)
		return;

	if (HPAGE_SHIFT > PAGE_SHIFT)
		order = HUGETLB_PAGE_ORDER;
	else
		order = MAX_ORDER - 1;

	/*
	 * Assume the largest contiguous order of interest is a huge page.
	 * This value may be variable depending on boot parameters on IA64 and
	 * powerpc.
	 */
	pageblock_order = order;
}
```

This sets the global pageblock_order. For architectures that do not set CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, pageblock_order uses the default value (MAX_ORDER - 1).
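
For a feel of the numbers, here is a sketch assuming a 4KB-granule arm64 configuration with 2MB huge pages and MAX_ORDER = 11 (both are assumptions that depend on the kernel config):

```c
#include <stdio.h>

#define PAGE_SHIFT  12   /* 4KB pages */
#define HPAGE_SHIFT 21   /* 2MB huge pages on arm64 with a 4KB granule */
#define MAX_ORDER   11

int main(void)
{
	unsigned int order;

	if (HPAGE_SHIFT > PAGE_SHIFT)
		order = HPAGE_SHIFT - PAGE_SHIFT;  /* HUGETLB_PAGE_ORDER = 9 */
	else
		order = MAX_ORDER - 1;             /* 10 */

	printf("pageblock_order = %u => pageblock = %lu KB\n",
	       order, (1UL << order) * (1UL << PAGE_SHIFT) / 1024);
	/* prints pageblock_order = 9 => pageblock = 2048 KB */
	return 0;
}
```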

3.3.7 init_currently_empty_zone


```c
void __meminit init_currently_empty_zone(struct zone *zone,
					unsigned long zone_start_pfn,
					unsigned long size)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int zone_idx = zone_idx(zone) + 1;

	if (zone_idx > pgdat->nr_zones)
		pgdat->nr_zones = zone_idx;

	zone->zone_start_pfn = zone_start_pfn;

	mminit_dprintk(MMINIT_TRACE, "memmap_init",
			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
			pgdat->node_id,
			(unsigned long)zone_idx(zone),
			zone_start_pfn, (zone_start_pfn + size));

	zone_init_free_lists(zone);
	zone->initialized = 1;
}
```

There is not much to dwell on here; the interesting part is zone_init_free_lists():

```c
static void __meminit zone_init_free_lists(struct zone *zone)
{
	unsigned int order, t;
	for_each_migratetype_order(order, t) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
		zone->free_area[order].nr_free = 0;
	}
}
```

Here, at last, is the zone free list: the free_area array inside struct zone, the core data structure of the buddy system.

Once zone_init_free_lists() completes, zone->initialized is set to 1, marking the zone's initialization as done. This flag matters: the later buddy-system initialization will only proceed for zones where it is set.
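
A stripped-down userspace model of this structure (list_head and the constants here are simplified stand-ins; MIGRATE_TYPES varies with the kernel config) makes the shape of the loop obvious:

```c
#include <stdio.h>

#define MAX_ORDER     11
#define MIGRATE_TYPES 6   /* illustrative; the real count depends on config */

/* simplified doubly-linked list head, standing in for struct list_head */
struct list_head { struct list_head *prev, *next; };

struct free_area {
	struct list_head free_list[MIGRATE_TYPES];
	unsigned long nr_free;
};

static void init_list_head(struct list_head *h) { h->prev = h->next = h; }

int main(void)
{
	static struct free_area free_area[MAX_ORDER];

	/* mirrors zone_init_free_lists(): every (order, migratetype) list empty */
	for (unsigned int order = 0; order < MAX_ORDER; order++) {
		for (unsigned int t = 0; t < MIGRATE_TYPES; t++)
			init_list_head(&free_area[order].free_list[t]);
		free_area[order].nr_free = 0;
	}

	printf("initialized %d orders x %d migrate types of empty free lists\n",
	       MAX_ORDER, MIGRATE_TYPES);
	return 0;
}
```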

3.3.8 memmap_init and __init_single_page

```c
static void __init memmap_init(void)
{
	unsigned long start_pfn, end_pfn;
	unsigned long hole_pfn = 0;
	int i, j, zone_id = 0, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		struct pglist_data *node = NODE_DATA(nid);

		for (j = 0; j < MAX_NR_ZONES; j++) {
			struct zone *zone = node->node_zones + j;

			if (!populated_zone(zone))
				continue;

			memmap_init_zone_range(zone, start_pfn, end_pfn,
					       &hole_pfn);
			zone_id = j;
		}
	}

#ifdef CONFIG_SPARSEMEM
	/*
	 * Initialize the memory map for hole in the range [memory_end,
	 * section_end].
	 * Append the pages in this hole to the highest zone in the last
	 * node.
	 * The call to init_unavailable_range() is outside the ifdef to
	 * silence the compiler warining about zone_id set but not used;
	 * for FLATMEM it is a nop anyway
	 */
	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
	if (hole_pfn < end_pfn)
#endif
		init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
```

With free_area initialized in the previous subsection, the zones themselves are done; memmap_init() now initializes every page. Given a pfn, pfn_to_page(pfn) easily locates the corresponding physical page.

Each page is then initialized individually by __init_single_page():

```c
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
				unsigned long zone, int nid)
{
	mm_zero_struct_page(page);
	set_page_links(page, zone, nid, pfn);
	init_page_count(page);
	page_mapcount_reset(page);
	page_cpupid_reset_last(page);
	page_kasan_tag_reset(page);

	INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
	if (!is_highmem_idx(zone))
		set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
```

First, the page is zeroed.

Then set_page_links() records the page's zone and node id in page->flags.

Next, the _refcount is set to 1 and _mapcount to -1, _last_cpupid is reset, and the lru list head is initialized.

Note: this is where struct page is first initialized; the important lru member (a struct list_head) is also set up here.
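
Conceptually, set_page_links() packs the zone and node id into the high bits of page->flags. A simplified model follows (the field widths here are illustrative; the real layout comes from ZONES_SHIFT/NODES_SHIFT in the kernel config):

```c
#include <stdio.h>
#include <stdint.h>

/* illustrative field layout: the real widths come from the kernel config */
#define NODES_SHIFT   2   /* up to 4 nodes */
#define ZONES_SHIFT   2   /* up to 4 zones */
#define NODES_PGSHIFT (64 - NODES_SHIFT)             /* 62 */
#define ZONES_PGSHIFT (NODES_PGSHIFT - ZONES_SHIFT)  /* 60 */

static void set_page_links_(uint64_t *flags, unsigned int zone, unsigned int nid)
{
	*flags |= (uint64_t)zone << ZONES_PGSHIFT;
	*flags |= (uint64_t)nid  << NODES_PGSHIFT;
}

int main(void)
{
	uint64_t flags = 0;

	set_page_links_(&flags, 2 /* ZONE_NORMAL */, 0 /* node 0 */);
	printf("flags = 0x%016llx, zone = %llu, nid = %llu\n",
	       (unsigned long long)flags,
	       (unsigned long long)((flags >> ZONES_PGSHIFT) & 3),
	       (unsigned long long)((flags >> NODES_PGSHIFT) & 3));
	return 0;
}
```

This is also why page_zone() and page_to_nid() can recover a page's zone and node purely from page->flags, without any extra storage per page.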


After __init_single_page() finishes, set_pageblock_migratetype() marks the page block as MIGRATE_MOVABLE; if pages of another type are needed later, they are converted from MIGRATE_MOVABLE.

At this point, the analysis of the zone initialization process is complete.