0. Preface
Continuing from the previous article ([linux内存管理] 第012篇 物理内存管理三大结构体之zone), this one covers zone initialization, which is the second half of bootmem_init() (the first half being sparse_init()). The second half mainly revolves around the zone_sizes_init() function.

```c
start_kernel()              //init/main.c
---->setup_arch()           //arch/arm64/kernel/setup.c
---->bootmem_init()         //arch/arm64/mm/init.c
---->memblocks_present()    //mm/sparse.c
---->sparse_init()          //mm/sparse.c
---->zone_sizes_init()
```
Before dissecting that function, note the arguments it receives:

```c
// arch/arm64/mm/init.c
void __init bootmem_init(void)
{
    unsigned long min, max;

    min = PFN_UP(memblock_start_of_DRAM());
    max = PFN_DOWN(memblock_end_of_DRAM());
    ...
    zone_sizes_init(min, max);
    ...
}
```
The min argument is the start PFN of the first region in memblock.memory, and max is the end PFN of its last region, i.e. the PFNs of the lowest and highest usable memory the system knows about.
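To make the rounding direction concrete, here is a minimal userspace sketch of the PFN_UP()/PFN_DOWN() arithmetic, assuming 4 KiB pages (the kernel's macros live in include/linux/pfn.h; the addresses below are made up):

```c
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round up: first whole page frame at or above addr. */
static uint64_t pfn_up(uint64_t addr)
{
    return (addr + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/* Round down: truncate to the page frame containing addr. */
static uint64_t pfn_down(uint64_t addr)
{
    return addr >> PAGE_SHIFT;
}

int main(void)
{
    /* A DRAM bank starting at 0x40000200 (not page aligned). */
    printf("min pfn = %#llx\n", (unsigned long long)pfn_up(0x40000200));   /* 0x40001 */
    printf("max pfn = %#llx\n", (unsigned long long)pfn_down(0x80000000)); /* 0x80000 */
    return 0;
}
```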
Now let's dissect zone_sizes_init() itself:
1. zone_sizes_init
```c
///Compute the maximum PFN of each zone type, then initialize every zone in each node
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
    unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
    unsigned int __maybe_unused acpi_zone_dma_bits;
    unsigned int __maybe_unused dt_zone_dma_bits;
    phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);

#ifdef CONFIG_ZONE_DMA
    acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
    dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
    zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
    ///Compute the zone's size; by default this covers all of DRAM
    arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
    max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
    max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
    if (!arm64_dma_phys_limit)
        arm64_dma_phys_limit = dma32_phys_limit;
#endif
    if (!arm64_dma_phys_limit)
        arm64_dma_phys_limit = PHYS_MASK + 1;
    max_zone_pfns[ZONE_NORMAL] = max;

    ///Initialize the zone structures from each zone's maximum PFN
    free_area_init(max_zone_pfns);
}
```
Under a UMA architecture the flow is simple: this function just computes the maximum PFN of each zone and finally calls free_area_init().
2. free_area_init
```c
void __init free_area_init(unsigned long *max_zone_pfn)
{
    unsigned long start_pfn, end_pfn;
    int i, nid, zone;
    bool descending;

    /* Record where the zone boundaries are */
    memset(arch_zone_lowest_possible_pfn, 0,
                sizeof(arch_zone_lowest_possible_pfn));
    memset(arch_zone_highest_possible_pfn, 0,
                sizeof(arch_zone_highest_possible_pfn));

    start_pfn = find_min_pfn_with_active_regions();
    descending = arch_has_descending_max_zone_pfns();

    for (i = 0; i < MAX_NR_ZONES; i++) {
        if (descending)
            zone = MAX_NR_ZONES - i - 1;
        else
            zone = i;

        if (zone == ZONE_MOVABLE)
            continue;

        end_pfn = max(max_zone_pfn[zone], start_pfn);
        arch_zone_lowest_possible_pfn[zone] = start_pfn;
        arch_zone_highest_possible_pfn[zone] = end_pfn;

        start_pfn = end_pfn;
    }

    /* Find the PFNs that ZONE_MOVABLE begins at in each node */
    memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
    find_zone_movable_pfns_for_nodes();

    /* Print out the zone ranges */
    pr_info("Zone ranges:\n");
    for (i = 0; i < MAX_NR_ZONES; i++) {
        if (i == ZONE_MOVABLE)
            continue;
        pr_info(" %-8s ", zone_names[i]);
        if (arch_zone_lowest_possible_pfn[i] ==
                arch_zone_highest_possible_pfn[i])
            pr_cont("empty\n");
        else
            pr_cont("[mem %#018Lx-%#018Lx]\n",
                (u64)arch_zone_lowest_possible_pfn[i]
                    << PAGE_SHIFT,
                ((u64)arch_zone_highest_possible_pfn[i]
                    << PAGE_SHIFT) - 1);
    }

    /* Print out the PFNs ZONE_MOVABLE begins at in each node */
    pr_info("Movable zone start for each node\n");
    for (i = 0; i < MAX_NUMNODES; i++) {
        if (zone_movable_pfn[i])
            pr_info(" Node %d: %#018Lx\n", i,
                (u64)zone_movable_pfn[i] << PAGE_SHIFT);
    }

    /*
     * Print out the early node map, and initialize the
     * subsection-map relative to active online memory ranges to
     * enable future "sub-section" extensions of the memory map.
     */
    pr_info("Early memory node ranges\n");
    for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
        pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
            (u64)start_pfn << PAGE_SHIFT,
            ((u64)end_pfn << PAGE_SHIFT) - 1);
        subsection_map_init(start_pfn, end_pfn - start_pfn);
    }

    /* Initialise every node */
    mminit_verify_pageflags_layout();
    setup_nr_node_ids();
    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        ///Initialize node nid
        free_area_init_node(nid);

        /* Any memory on that node */
        if (pgdat->node_present_pages)
            node_set_state(nid, N_MEMORY);
        check_for_memory(pgdat, nid);
    }

    ///Initialize the struct page array
    memmap_init();
}
```
free_area_init() sets up the kernel's physical memory management: it fixes the boundaries of every memory zone in each NUMA node, initializes their data structures, and completes the basic zone configuration. Let's walk through it in detail:
2.1 Initializing the zone boundary records
```c
memset(arch_zone_lowest_possible_pfn, 0, sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn));
```
- Zero arch_zone_lowest_possible_pfn and arch_zone_highest_possible_pfn, which record the start and end page frame number (PFN) of every zone.
- A PFN uniquely identifies a physical page: PFN = physical address / page size.
2.2 Determining the starting PFN
```c
start_pfn = find_min_pfn_with_active_regions();
descending = arch_has_descending_max_zone_pfns();
```
- find_min_pfn_with_active_regions(): finds the smallest active PFN, used as the starting point for initialization.
- arch_has_descending_max_zone_pfns(): checks whether the architecture lays its zones out in descending order. Some architectures allocate starting from high addresses, which changes the order in which zones are initialized.
2.3 Initializing the zone boundaries
```c
for (i = 0; i < MAX_NR_ZONES; i++) {
    if (descending)
        zone = MAX_NR_ZONES - i - 1;
    else
        zone = i;

    if (zone == ZONE_MOVABLE)
        continue;

    end_pfn = max(max_zone_pfn[zone], start_pfn);
    arch_zone_lowest_possible_pfn[zone] = start_pfn;
    arch_zone_highest_possible_pfn[zone] = end_pfn;

    start_pfn = end_pfn;
}
```
- Walk all zones (ZONE_DMA, ZONE_NORMAL, ...) in the order the architecture requires (ascending or descending).
- end_pfn is the end PFN of the current zone; max_zone_pfn[zone] supplies the zone's maximum possible PFN. If a zone is empty, start_pfn == end_pfn.
- ZONE_MOVABLE is skipped because its boundaries are computed separately afterwards.

A worked example of this partitioning is sketched below.
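Here is a minimal userspace sketch of the carving-up logic: consecutive zones split [start_pfn, max) into adjacent, non-overlapping ranges (the zone set and PFN values below are invented inputs, not from a real machine):

```c
#include <stdio.h>

enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, NR_ZONES };

int main(void)
{
    /* Example inputs: max PFN per zone, as zone_sizes_init() would fill in. */
    unsigned long max_zone_pfn[NR_ZONES] = {
        [ZONE_DMA]    = 0x100000,  /* 4 GiB with 4 KiB pages */
        [ZONE_DMA32]  = 0x100000,  /* same limit -> DMA32 ends up empty */
        [ZONE_NORMAL] = 0x880000,
    };
    const char *names[NR_ZONES] = { "DMA", "DMA32", "Normal" };
    unsigned long start_pfn = 0x80000, end_pfn; /* lowest active PFN */

    for (int zone = 0; zone < NR_ZONES; zone++) {
        end_pfn = max_zone_pfn[zone] > start_pfn ? max_zone_pfn[zone] : start_pfn;
        if (start_pfn == end_pfn)
            printf("%-8s empty\n", names[zone]);
        else
            printf("%-8s [pfn %#lx - %#lx]\n", names[zone], start_pfn, end_pfn - 1);
        start_pfn = end_pfn; /* next zone begins where this one ends */
    }
    return 0;
}
```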
2.4 Computing the ZONE_MOVABLE boundaries
```c
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes();
```
- ZONE_MOVABLE is a zone designed for memory hotplug and dynamic memory management; its start PFN is computed dynamically.
- find_zone_movable_pfns_for_nodes(): walks every node and determines the start PFN of ZONE_MOVABLE on each one.
2.5 Printing the zone information
```c
pr_info("Zone ranges:\n");
pr_info("Movable zone start for each node\n");
pr_info("Early memory node ranges\n");
```
- Print the boundaries (start and end PFN) of every zone.
- Print the start of ZONE_MOVABLE on each node.
- Print the PFN ranges of the early memory nodes.
2.6 Initializing every NUMA node
```c
for_each_online_node(nid) {
    pg_data_t *pgdat = NODE_DATA(nid);

    free_area_init_node(nid);

    if (pgdat->node_present_pages)
        node_set_state(nid, N_MEMORY);
    check_for_memory(pgdat, nid);
}
```
- Walk all online NUMA nodes and call free_area_init_node() on each one, which:
  - fills in the zone information in the node's pgdat;
  - initializes each zone's free_area structures (used by the buddy system).
- If a node has present memory pages, set its state to N_MEMORY.

Note: free_area_init_node() gets its own chapter below.
2.7 Initializing struct page
```c
memmap_init();
```
- struct page is the abstraction of a physical page; every page frame has a corresponding struct page.
- memmap_init() initializes those struct page structures, allocating and setting up the memory map.
2.8 free_area_init summary
- free_area_init() is a core function of Linux memory-management initialization: by partitioning zones, initializing nodes, and setting up the buddy system, it completes the groundwork for managing all physical memory.
- It relies on several helper functions and architecture hooks and adapts its behavior to the system configuration.
- Its correct execution is the foundation on which the kernel's page allocator (e.g. alloc_pages()) works.
3. free_area_init_node
```c
static void __init free_area_init_node(int nid)
{
    ///Fetch the memory node
    pg_data_t *pgdat = NODE_DATA(nid);
    unsigned long start_pfn = 0;
    unsigned long end_pfn = 0;

    /* pg_data_t should be reset to zero when it's allocated */
    WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);

    ///Get the node's start and end PFNs
    get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

    ///On a UMA system there is a single node, nid = 0
    pgdat->node_id = nid;
    pgdat->node_start_pfn = start_pfn;
    pgdat->per_cpu_nodestats = NULL;

    ///Physical address range of node nid
    pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
        (u64)start_pfn << PAGE_SHIFT,
        end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);

    ///Compute the spanned_pages and present_pages members
    calculate_node_totalpages(pgdat, start_pfn, end_pfn);

    ///Allocate the mem_map
    alloc_node_mem_map(pgdat);
    pgdat_set_deferred_range(pgdat);

    ///Initialize the node's core members
    free_area_init_core(pgdat);
}
```
Key helper functions:

- calculate_node_totalpages(): computes and classifies the node's total page counts:
  - spanned_pages: the pages the node spans, including invalid or reserved pages;
  - present_pages: the pages actually present and usable.
- alloc_node_mem_map(): allocates the node's mem_map. mem_map is an array of struct page, one descriptor per physical page.
- free_area_init_core(): initializes the node's core management structures, mainly:
  - setting the range of each zone;
  - initializing the buddy system's free_area structures.
3.1 calculate_node_totalpages
```c
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
                        unsigned long node_start_pfn,
                        unsigned long node_end_pfn)
{
    unsigned long realtotalpages = 0, totalpages = 0;
    enum zone_type i;

    for (i = 0; i < MAX_NR_ZONES; i++) {
        struct zone *zone = pgdat->node_zones + i;
        unsigned long zone_start_pfn, zone_end_pfn;
        unsigned long spanned, absent;
        unsigned long size, real_size;

        spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
                             node_start_pfn,
                             node_end_pfn,
                             &zone_start_pfn,
                             &zone_end_pfn);
        absent = zone_absent_pages_in_node(pgdat->node_id, i,
                           node_start_pfn,
                           node_end_pfn);

        size = spanned;
        real_size = size - absent;

        if (size)
            zone->zone_start_pfn = zone_start_pfn;
        else
            zone->zone_start_pfn = 0;
        zone->spanned_pages = size;
        zone->present_pages = real_size;
#if defined(CONFIG_MEMORY_HOTPLUG)
        zone->present_early_pages = real_size;
#endif

        totalpages += size;
        realtotalpages += real_size;
    }

    pgdat->node_spanned_pages = totalpages;
    pgdat->node_present_pages = realtotalpages;
    pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
```
First, zone_spanned_pages_in_node() is called to obtain, for each zone:

- zone_start_pfn;
- zone_end_pfn;
- the zone's spanned size, which currently is the zone's entire physical span.

Then zone_absent_pages_in_node() is called to obtain:

- the zone's hole size, i.e. the size of the holes computed earlier.

From the zone's size and hole size, the zone's actual total memory real_size (holes removed) is derived.

After calculate_node_totalpages() runs, we have:

- the zone's zone_start_pfn, the starting PFN of the zone's physical memory;
- the zone's spanned_pages, the total memory the zone spans;
- the zone's present_pages, the zone's actual total memory (holes removed);
- the node's spanned_pages, the node's total memory, the sum over all its zones;
- the node's present_pages, the node's actual total memory, all zones' memory minus their holes.

A small worked sketch of this bookkeeping follows.
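A minimal stand-alone example, with an invented region list containing one hole (the PFN values are illustrative only):

```c
#include <stdio.h>

struct region { unsigned long start_pfn, end_pfn; };

int main(void)
{
    /* Two present regions inside a zone spanning PFNs [0x80000, 0x100000). */
    struct region present[] = { { 0x80000, 0xc0000 }, { 0xd0000, 0x100000 } };
    unsigned long zone_start = 0x80000, zone_end = 0x100000;

    unsigned long spanned = zone_end - zone_start;        /* includes the hole */
    unsigned long present_pages = 0;
    for (int i = 0; i < 2; i++)
        present_pages += present[i].end_pfn - present[i].start_pfn;
    unsigned long absent = spanned - present_pages;       /* the hole */

    printf("spanned=%#lx present=%#lx absent=%#lx\n",
           spanned, present_pages, absent);
    return 0;
}
```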
3.2 alloc_node_mem_map
This creates the node's memmap and normally applies only when the sparse memory model is not in use, so it is not dissected here. Just know that the function is governed by CONFIG_FLAT_NODE_MEM_MAP.
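For intuition, under the flat model the pfn/page conversion is plain array indexing. A simplified sketch of the relation (the ARCH_PFN_OFFSET value is assumed, and struct page is reduced to a stub; the kernel's actual macros depend on the configured memory model):

```c
/* Simplified FLATMEM-style conversion: mem_map is one flat array of
 * struct page, and ARCH_PFN_OFFSET is the PFN of mem_map[0]. */
struct page { unsigned long flags; /* ... */ };

#define ARCH_PFN_OFFSET 0x80000UL
static struct page *mem_map;        /* allocated by alloc_node_mem_map() */

static inline struct page *pfn_to_page_flat(unsigned long pfn)
{
    return mem_map + (pfn - ARCH_PFN_OFFSET);
}

static inline unsigned long page_to_pfn_flat(const struct page *page)
{
    return (unsigned long)(page - mem_map) + ARCH_PFN_OFFSET;
}
```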
3.3 free_area_init_core
```c
static void __init free_area_init_core(struct pglist_data *pgdat)
{
    enum zone_type j;
    int nid = pgdat->node_id;

    pgdat_init_internals(pgdat);
    pgdat->per_cpu_nodestats = &boot_nodestats;

    for (j = 0; j < MAX_NR_ZONES; j++) {
        struct zone *zone = pgdat->node_zones + j;
        unsigned long size, freesize, memmap_pages;

        size = zone->spanned_pages;
        freesize = zone->present_pages;
        pr_debug("---j=%d, size=%lu,freesize=%lu\n", j, size, freesize);

        /*
         * Adjust freesize so that it accounts for how much memory
         * is used by this zone for memmap. This affects the watermark
         * and per-cpu initialisations
         */
        memmap_pages = calc_memmap_size(size, freesize);
        if (!is_highmem_idx(j)) {
            if (freesize >= memmap_pages) {
                freesize -= memmap_pages;
                if (memmap_pages)
                    pr_debug("--- %s zone: %lu pages used for memmap\n",
                         zone_names[j], memmap_pages);
            } else
                pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
                    zone_names[j], memmap_pages, freesize);
        }

        /* Account for reserved pages */
        if (j == 0 && freesize > dma_reserve) {
            freesize -= dma_reserve;
            pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
        }

        if (!is_highmem_idx(j))
            nr_kernel_pages += freesize;
        /* Charge for highmem memmap if there are enough kernel pages */
        else if (nr_kernel_pages > memmap_pages * 2)
            nr_kernel_pages -= memmap_pages;
        nr_all_pages += freesize;

        /*
         * Set an approximate value for lowmem here, it will be adjusted
         * when the bootmem allocator frees pages into the buddy system.
         * And all highmem pages will be managed by the buddy system.
         */
        zone_init_internals(zone, j, nid, freesize);

        if (!size)
            continue;

        set_pageblock_order();
        ///Set up the pageblock migrate-type bitmap; a pageblock's
        ///MIGRATE_TYPES value is accessed via
        ///get_pageblock_migratetype()/set_pageblock_migratetype()
        setup_usemap(zone);
        ///Initialize the free_area fields
        init_currently_empty_zone(zone, zone->zone_start_pfn, size);
    }
}
```
3.3.1 pgdat_init_internals
```c
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
    pgdat_resize_init(pgdat);
    pgdat_init_split_queue(pgdat);
    pgdat_init_kcompactd(pgdat);

    init_waitqueue_head(&pgdat->kswapd_wait);
    init_waitqueue_head(&pgdat->pfmemalloc_wait);

    pgdat_page_ext_init(pgdat);
    lruvec_init(&pgdat->__lruvec);
}
```
- pgdat_resize_init(): initializes the pgdat->node_size_lock spinlock;
- pgdat_init_split_queue(): for transparent huge pages, requires CONFIG_TRANSPARENT_HUGEPAGE;
- pgdat_init_kcompactd(): initializes the pgdat->kcompactd_wait wait queue;
- init_waitqueue_head(): initializes the pgdat->kswapd_wait and pgdat->pfmemalloc_wait wait queues;
- pgdat_page_ext_init(): page_ext setup, effectively a no-op unless page extensions are configured;
- lruvec_init(&pgdat->__lruvec): initializes pgdat->__lruvec.
3.3.2 Initializing per_cpu_nodestats
```c
pgdat->per_cpu_nodestats = &boot_nodestats;
```

This points at the global variable boot_nodestats, which manages each CPU's node statistics; see enum node_stat_item for the details.
3.3.3 calc_memmap_size
```c
static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
                         unsigned long present_pages)
{
    unsigned long pages = spanned_pages;

    /*
     * Provide a more accurate estimation if there are holes within
     * the zone and SPARSEMEM is in use. If there are holes within the
     * zone, each populated memory region may cost us one or two extra
     * memmap pages due to alignment because memmap pages for each
     * populated regions may not be naturally aligned on page boundary.
     * So the (present_pages >> 4) heuristic is a tradeoff for that.
     */
    if (spanned_pages > present_pages + (present_pages >> 4) &&
        IS_ENABLED(CONFIG_SPARSEMEM))
        pages = present_pages;

    return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
```
This function estimates more accurately how many pages each zone needs for its memmap.

There is a small heuristic in choosing pages: when hole_pages > present_pages/16, spanned_pages is considered to contain too many holes, and since hole pages are not necessarily page-aligned, present_pages is used as the number of pages to map instead.

Every page is managed through a struct page, so the return value is the number of pages needed to hold all of the struct page descriptors. The result is assigned to memmap_pages and printed:
```
[ 0.000000] DMA32 zone: 484 pages used for memmap
[ 0.000000] DMA32 zone: 0 pages reserved
[ 0.000000] DMA32 zone: 30976 pages, LIFO batch:7
[ 0.000000] Normal zone: 15116 pages used for memmap
[ 0.000000] Normal zone: 952064 pages, LIFO batch:63
```
As the log shows, 484 pages in DMA32 are used for the memmap.
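That number can be reproduced by hand. Assuming sizeof(struct page) == 64 bytes (a common value on arm64, but configuration-dependent), the 30976 DMA32 pages need 30976 * 64 bytes of descriptors, which is exactly 484 pages:

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long pages = 30976;            /* DMA32 pages from the log */
    unsigned long struct_page_size = 64;    /* assumed sizeof(struct page) */

    unsigned long memmap_pages =
        PAGE_ALIGN(pages * struct_page_size) >> PAGE_SHIFT;
    printf("%lu pages used for memmap\n", memmap_pages);  /* prints 484 */
    return 0;
}
```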
3.3.4 Accounting nr_kernel_pages and nr_all_pages
Once memmap_pages has been accounted, the system treats that memory as unusable from now on; it is deducted, and the remainder is recorded in nr_kernel_pages and nr_all_pages.
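As a quick worked check with the DMA32 numbers from the log above (assuming dma_reserve is 0, as is typical): freesize = present_pages - memmap_pages = 30976 - 484 = 30492 pages, which is what this zone contributes to nr_kernel_pages and nr_all_pages.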
3.3.5 zone_init_internals()
Before analyzing this function, note its last parameter, remaining_pages: it is zone->present_pages minus memmap_pages (see section 3.3.3) and minus dma_reserve (usually 0).

Also note that the zone->managed_pages set here is only an initial value; it is reset in memblock_free_all(), and the reserved portion is removed during the later buddy initialization, yielding the final managed_pages available to the buddy system.
```c
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
                        unsigned long remaining_pages)
{
    atomic_long_set(&zone->managed_pages, remaining_pages);
    zone_set_nid(zone, nid);
    zone->name = zone_names[idx];
    zone->zone_pgdat = NODE_DATA(nid);
    spin_lock_init(&zone->lock);
    zone_seqlock_init(zone);
    zone_pcp_init(zone);
}
```
- Store remaining_pages into zone->managed_pages;
- On UMA, zone_set_nid() is a no-op since there is only one node;
- Set the zone name from the global zone_names array;
- Set the zone's pgdat, recording which node the zone belongs to, so that the node can easily be found from the zone later; on UMA, zone_pgdat is contig_page_data;
- Initialize the zone's lock spinlock;
- zone_pcp_init(): initialize the zone's pageset variables used by the pcp (per-CPU pages) machinery.

Let's look at zone_pcp_init():
```c
static __meminit void zone_pcp_init(struct zone *zone)
{
    /*
     * per cpu subsystem is not up at this point. The following code
     * relies on the ability of the linker to provide the
     * offset of a (static) per cpu variable into the per cpu area.
     */
    zone->per_cpu_pageset = &boot_pageset;
    zone->per_cpu_zonestats = &boot_zonestats;
    zone->pageset_high = BOOT_PAGESET_HIGH;
    zone->pageset_batch = BOOT_PAGESET_BATCH;

    if (populated_zone(zone))
        pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
             zone->present_pages, zone_batchsize(zone));
}
```
This is where the zone's pageset is initialized, pointing at the global variable boot_pageset.

boot_pageset is a per-CPU variable: each CPU has a local struct per_cpu_pageset, whose member struct per_cpu_pages maintains an order-0 list per CPU; see section 2.1 of the previous article for the data structures.

Back in zone_pcp_init(), zone_batchsize() computes the batch value of the zone's pcp lists:
```c
static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
    int batch;

    /*
     * The number of pages to batch allocate is either ~0.1%
     * of the zone or 1MB, whichever is smaller. The batch
     * size is striking a balance between allocation latency
     * and zone lock contention.
     */
    batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE);
    batch /= 4; /* We effectively *= 4 below */
    if (batch < 1)
        batch = 1;

    /*
     * Clamp the batch to a 2^n - 1 value. Having a power
     * of 2 value was found to be more likely to have
     * suboptimal cache aliasing properties in some cases.
     *
     * For example if 2 tasks are alternately allocating
     * batches of pages, one task can end up with a lot
     * of pages of one half of the possible page colors
     * and the other with pages of the other colors.
     */
    batch = rounddown_pow_of_two(batch + batch/2) - 1;

    return batch;

#else
    /* The deferral and batching of frees should be suppressed under NOMMU
     * conditions.
     *
     * The problem is that NOMMU needs to be able to allocate large chunks
     * of contiguous memory as there's no hardware page translation to
     * assemble apparent contiguous memory from discontiguous pages.
     *
     * Queueing large contiguous runs of pages for batching, however,
     * causes the pages to actually be freed in smaller chunks. As there
     * can be a significant delay between the individual batches being
     * recycled, this leads to the once large chunks of space being
     * fragmented and becoming unavailable for high-order allocations.
     */
    return 0;
#endif
}
```
batch is zone->managed_pages divided by 1024; if that exceeds 256 (1 MB worth of pages), it is capped at 256. The result is then divided by 4 and finally clamped to a 2^n - 1 value via rounddown_pow_of_two(batch + batch/2) - 1.
As in the log below, the NORMAL zone's managed_pages is large enough to hit the cap, so its batch comes out as 63:

```
[ 0.000000] DMA32 zone: 484 pages used for memmap
[ 0.000000] DMA32 zone: 0 pages reserved
[ 0.000000] DMA32 zone: 30976 pages, LIFO batch:7
[ 0.000000] Normal zone: 15116 pages used for memmap
[ 0.000000] Normal zone: 952064 pages, LIFO batch:63
```
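Both batch values in that log can be reproduced from the arithmetic above. A minimal stand-alone sketch (the page counts are taken from the log; the kernel uses managed_pages rather than the printed present_pages, but the difference does not change the results here):

```c
#include <stdio.h>

/* Round down to a power of two (minimal stand-in for the kernel helper). */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
    while (n & (n - 1))
        n &= n - 1;
    return n;
}

static int zone_batchsize(unsigned long managed_pages)
{
    unsigned long batch = managed_pages >> 10;   /* ~0.1% of the zone */
    if (batch > 256)                             /* cap at 1MB / 4KB pages */
        batch = 256;
    batch /= 4;
    if (batch < 1)
        batch = 1;
    return rounddown_pow_of_two(batch + batch / 2) - 1;
}

int main(void)
{
    printf("DMA32  batch: %d\n", zone_batchsize(30976));  /* 7 */
    printf("Normal batch: %d\n", zone_batchsize(952064)); /* 63 */
    return 0;
}
```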
Note: the batch computed here is not stored into the zone; it is only calculated for the print once present_pages is known. The real per-CPU pageset initialization happens later in setup_per_cpu_pageset(), called from start_kernel() (init/main.c).
3.3.6 set_pageblock_order
```c
void __init set_pageblock_order(void)
{
    unsigned int order;

    /* Check that pageblock_nr_pages has not already been setup */
    if (pageblock_order)
        return;

    if (HPAGE_SHIFT > PAGE_SHIFT)
        order = HUGETLB_PAGE_ORDER;
    else
        order = MAX_ORDER - 1;

    /*
     * Assume the largest contiguous order of interest is a huge page.
     * This value may be variable depending on boot parameters on IA64 and
     * powerpc.
     */
    pageblock_order = order;
}
```
This sets the global variable pageblock_order. For architectures that do not define CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, pageblock_order uses the default value (MAX_ORDER - 1).
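To put numbers on it, a small sketch under the assumption of arm64 with 4 KiB base pages, where a PMD-sized huge page is 2 MiB:

```c
/* Assumed arm64 defaults with 4 KiB base pages (illustrative values):
 * a PMD-sized huge page is 2 MiB, so HPAGE_SHIFT = 21 and PAGE_SHIFT = 12. */
#define PAGE_SHIFT          12
#define HPAGE_SHIFT         21
#define HUGETLB_PAGE_ORDER  (HPAGE_SHIFT - PAGE_SHIFT)   /* = 9 */

/* pageblock_order = 9 -> a pageblock covers 2^9 pages = 512 * 4 KiB = 2 MiB,
 * matching the huge-page size so a pageblock can migrate as one unit. */
```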
3.3.7 init_currently_empty_zone
```c
void __meminit init_currently_empty_zone(struct zone *zone,
                    unsigned long zone_start_pfn,
                    unsigned long size)
{
    struct pglist_data *pgdat = zone->zone_pgdat;
    int zone_idx = zone_idx(zone) + 1;

    if (zone_idx > pgdat->nr_zones)
        pgdat->nr_zones = zone_idx;

    zone->zone_start_pfn = zone_start_pfn;

    mminit_dprintk(MMINIT_TRACE, "memmap_init",
            "Initialising map node %d zone %lu pfns %lu -> %lu\n",
            pgdat->node_id,
            (unsigned long)zone_idx(zone),
            zone_start_pfn, (zone_start_pfn + size));

    zone_init_free_lists(zone);

    zone->initialized = 1;
}
```
There is not much to dissect here; the interesting part is zone_init_free_lists() inside it:
```c
static void __meminit zone_init_free_lists(struct zone *zone)
{
    unsigned int order, t;

    for_each_migratetype_order(order, t) {
        INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
        zone->free_area[order].nr_free = 0;
    }
}
```
Here, at last, is the zone's free list: the free_area array inside struct zone, the core data structure of the buddy system.

After zone_init_free_lists() completes, zone->initialized is set to 1, marking the zone as initialized. This flag matters: the later buddy initialization requires it to be 1.
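For reference, a simplified view of the structure that was just initialized: zone->free_area is indexed by order, and each order holds one list head per migrate type (a simplified sketch; the field set, MAX_ORDER, and MIGRATE_TYPES vary by kernel version and config):

```c
struct list_head { struct list_head *next, *prev; };

#define MAX_ORDER      11   /* orders 0..10: 4 KiB up to 4 MiB blocks (assumed) */
#define MIGRATE_TYPES  6    /* UNMOVABLE, MOVABLE, RECLAIMABLE, ... (assumed) */

/* One bucket per order: a free list per migrate type plus a counter. */
struct free_area {
    struct list_head free_list[MIGRATE_TYPES];
    unsigned long    nr_free;
};

struct zone_sketch {
    /* zone_init_free_lists() walks free_area[order].free_list[type]
     * with for_each_migratetype_order(), calls INIT_LIST_HEAD() on
     * each list and zeroes each nr_free. */
    struct free_area free_area[MAX_ORDER];
};
```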
3.3.8 memmap_init and __init_single_page
```c
static void __init memmap_init(void)
{
    unsigned long start_pfn, end_pfn;
    unsigned long hole_pfn = 0;
    int i, j, zone_id = 0, nid;

    for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
        struct pglist_data *node = NODE_DATA(nid);

        for (j = 0; j < MAX_NR_ZONES; j++) {
            struct zone *zone = node->node_zones + j;

            if (!populated_zone(zone))
                continue;

            memmap_init_zone_range(zone, start_pfn, end_pfn,
                           &hole_pfn);
            zone_id = j;
        }
    }

#ifdef CONFIG_SPARSEMEM
    /*
     * Initialize the memory map for hole in the range [memory_end,
     * section_end].
     * Append the pages in this hole to the highest zone in the last
     * node.
     * The call to init_unavailable_range() is outside the ifdef to
     * silence the compiler warining about zone_id set but not used;
     * for FLATMEM it is a nop anyway
     */
    end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
    if (hole_pfn < end_pfn)
#endif
        init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
```
With free_area initialized in the previous subsection, the zone itself is done, and memmap_init() now initializes all of the pages; pfn_to_page(pfn) readily yields the physical page's struct page.

Each page is then initialized individually via __init_single_page():
```c
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
                unsigned long zone, int nid)
{
    mm_zero_struct_page(page);
    set_page_links(page, zone, nid, pfn);
    init_page_count(page);
    page_mapcount_reset(page);
    page_cpupid_reset_last(page);
    page_kasan_tag_reset(page);

    INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
    /* The shift won't overflow because ZONE_NORMAL is below 4G. */
    if (!is_highmem_idx(zone))
        set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
```
First the page is zeroed.

Then set_page_links() records in page->flags which zone and node id the page belongs to.

Next, the _refcount counter is set to 1, _mapcount to -1, _last_cpupid is reset, and the lru list head is initialized.

Note: this is where struct page is first initialized; the page's important lru member (struct list_head) is initialized here too. A simplified sketch of the flags packing follows.
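set_page_links() packs the zone index and node id into the high bits of page->flags. A stand-alone simplification of that encoding (the shift widths below are assumed for illustration; the kernel derives them from the configuration):

```c
#include <stdio.h>

/* Illustrative field layout: zone and node live in the top bits of
 * page->flags (real kernels derive these shifts from config options). */
#define NODES_SHIFT   6                  /* up to 64 nodes (assumed) */
#define ZONES_SHIFT   2                  /* up to 4 zones (assumed)  */
#define NODES_PGSHIFT (64 - NODES_SHIFT)
#define ZONES_PGSHIFT (NODES_PGSHIFT - ZONES_SHIFT)

static unsigned long pack_page_links(unsigned long flags, int zone, int nid)
{
    flags |= (unsigned long)nid  << NODES_PGSHIFT;
    flags |= (unsigned long)zone << ZONES_PGSHIFT;
    return flags;
}

int main(void)
{
    unsigned long flags = pack_page_links(0, /*ZONE_NORMAL=*/2, /*nid=*/0);
    printf("zone=%lu nid=%lu\n",
           (flags >> ZONES_PGSHIFT) & 0x3,     /* page_zonenum() analogue */
           (flags >> NODES_PGSHIFT) & 0x3f);   /* page_to_nid() analogue  */
    return 0;
}
```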
After __init_single_page() completes, set_pageblock_migratetype() sets the page block's type to MIGRATE_MOVABLE; if pages of other types are needed later, they are converted from MIGRATE_MOVABLE.

With that, the analysis of the zone initialization process is complete.