0. 前言
在上一篇文章中我们分析了__alloc_pages
中的get_page_from_freelist
,也就是快速分配部分。这个函数会根据分配掩码和分配order进行快速分配,若快速分配过程并不能分配到合适的内存时,则会进入慢速分配的过程。
本文紧接前文继续分析__alloc_pages
函数,继续剖析buddy内存分配的另一个过程:慢速分配
1. __alloc_pages_slowpath
当快速路径分配内存失败时,内核会调用这个函数来尝试各种方法(如回收内存、整理内存碎片、甚至启动 OOM 杀手)来成功分配内存
/*
 * __alloc_pages_slowpath - slow-path page allocation.
 *
 * Entered after the fast path (get_page_from_freelist at the low
 * watermark) has failed.  Progressively escalates: wake kswapd,
 * retry at the min watermark, try async compaction, direct reclaim,
 * direct compaction, and finally the OOM killer.
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
/// Whether direct reclaim is allowed (implied by GFP_KERNEL,
/// GFP_HIGHUSER_MOVABLE and similar masks carrying __GFP_DIRECT_RECLAIM)
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
/// "Costly" request: order above PAGE_ALLOC_COSTLY_ORDER (multiple pages)
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
// Result pointer; stays NULL until an attempt succeeds
struct page *page = NULL;
// Allocation flags recomputed below for the slow-path retries
unsigned int alloc_flags;
// Whether the last reclaim / OOM pass made any progress
unsigned long did_some_progress;
// Priority used for memory compaction attempts
enum compact_priority compact_priority;
// Outcome of the last compaction attempt
enum compact_result compact_result;
// Number of compaction retries so far
int compaction_retries;
// Number of consecutive loops without reclaim progress
int no_progress_loops;
// Cookie to detect concurrent cpuset mems_allowed updates
unsigned int cpuset_mems_cookie;
// Non-zero when the caller may dip into the memory reserves
int reserve_flags;
/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
/// __GFP_ATOMIC is for atomic (e.g. interrupt) context and is mutually
/// exclusive with __GFP_DIRECT_RECLAIM; if both are set, warn once and
/// strip __GFP_ATOMIC
if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
/// The fast path failed at the low watermark; recompute alloc_flags
/// (min watermark, etc.) for the retries below
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* We need to recalculate the starting point for the zonelist iterator
* because we might have used different nodemask in the fast path, or
* there was a cpuset modification and we are retrying - otherwise we
* could end up iterating over non-eligible zones endlessly.
*/
/// Re-pick the preferred zone for the iterator
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
/// Wake the kswapd kernel threads to start background reclaim
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
/// Retry the freelist allocation with the adjusted flags
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
* For costly allocations, try direct compaction first, as it's likely
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
/// Try async memory compaction first; preconditions:
/// - direct reclaim is allowed
/// - costly / high-order request (or non-movable order > 0)
/// - caller may not dip into the system memory reserves
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes some THP page fault allocations
*/
if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If allocating entire pageblock(s) and compaction
* failed because all zones are below low watermarks
* or is prohibited because it recently failed at this
* order, fail immediately unless the allocator has
* requested compaction and reclaim retry.
*
* Reclaim is
* - potentially very expensive because zones are far
* below their low watermarks or this is part of very
* bursty high order allocations,
* - not guaranteed to help because isolate_freepages()
* may not iterate over freed pages as part of its
* linear scan, and
* - unlikely to make entire pageblocks free on its
* own.
*/
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*/
compact_priority = INIT_COMPACT_PRIORITY;
}
}
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
/// Wake kswapd again so it keeps reclaiming while we loop
wake_all_kswapds(order, gfp_mask, ac);
/// Recompute reserve access; a non-zero result lets this allocation
/// use part of the system reserved memory
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
}
/// Retry the allocation now that kswapd has had a chance to reclaim
/* Attempt with potentially adjusted zonelist and alloc_flags */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
goto nopage;
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
/// Trigger direct (synchronous) memory reclaim, then retry
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/// Try direct compaction once more, then retry the allocation
/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_RETRY_MAYFAIL
*/
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
/// Decide whether direct reclaim is worth retrying; this also bumps
/// the no_progress_loops counter
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
/// Decide whether compaction is worth retrying
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
goto retry;
/* Deal with possible cpuset update races before we start OOM killing */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/// Reclaim and compaction have both failed; fall back to the OOM killer
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
(alloc_flags & ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
goto retry;
}
nopage:
/* Deal with possible cpuset update races before we fail */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;
/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
/*
* Help non-failing allocations by giving them access to memory
* reserves but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse
*/
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;
cond_resched();
goto retry;
}
fail:
/// Every attempt failed: log the failure with a stack trace and
/// a system memory-info dump
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
got_pg:
return page;
}
下面针对一些重要的细节单独分析:
1.1 can_direct_reclaim
这个参数是用来表示是否允许调用直接内存回收的,那些隐含了 __GFP_DIRECT_RECLAIM 标志的分配掩码都会使用直接页面回收机制,如常用的 GFP_KERNEL、GFP_KERNEL_ACCOUNT、GFP_NOWAIT、GFP_NOIO、GFP_NOFS、GFP_USER、GFP_HIGHUSER_MOVABLE 等
1.2 costly_order
costly_order 高成本的申请,表示会形成一定的内存分配压力。 PAGE_ALLOC_COSTLY_ORDER 定义为3,如当分配请求 order 为4,即要分配 64KB 大小的物理内存,会给页面分配器带来一定的内存压力
1.3 检查滥用__GFP_ATOMIC
if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
__GFP_ATOMIC 表示调用页面分配器的进程不能直接回收页面或休眠等待,调用者通常在中断上下文中,使用场景与 GFP_KERNEL 相反:GFP_KERNEL 是内核分配的常用标志之一,它可能会被阻塞,即分配过程中可能会睡眠,因此在中断上下文和不能睡眠的内核路径里使用该类型标志需要特别警惕,因为这会引起死锁或者其他系统异常
另外,__GFP_ATOMIC 是优先级比较高的分配行为,它允许访问部分的系统预留内存
1.4 尝试直接内存规整
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
若以最低警戒水位无法分配到内存,可以在满足以下的条件时,考虑尝试先进行 直接内存规整 来解决页面分配失败的问题
- can_direct_reclaim 为true,即允许调用直接页面回收机制;
- 高成本申请,这时系统有可能有足够的空闲内存,但是没有满足分配需要的 order,调用内存规整机制可能解决这个问题。或者order 大于0,但申请非 MOVABLE 页面;
- 不能访问系统预留内存。 gfp_pfmemalloc_allowed() 函数表示是否允许访问系统预留的内存,若返回 ALLOC_NO_WATERMARKS,表示不用考虑水位;若返回 0,表示不允许访问系统预留内存;
上面三个条件是并且关系,且内存规整的priority 是COMPACT_PRIO_ASYNC,即异步模式。
1.5 __alloc_pages_direct_reclaim
直接内存回收的核心处理函数,后面单独出一篇分析
{% tip warning %}
TODO
{% endtip %}
1.6 __alloc_pages_direct_compact
直接内存规整的核心处理函数,后面单独出一篇分析
{% tip warning %}
TODO
{% endtip %}
1.7 warn_alloc
如果经过一系列的尝试之后还是无法分配到需要的 page 时,会调用 warn_alloc() 来宣告此次内存分配失败,输出内核打印信息:
fail:
///所有分配尝试都失败,打印函数调用栈,系统信息
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
/*
 * warn_alloc - report a page allocation failure.
 *
 * Rate-limited (at most one report per 10*HZ window).  Suppressed when
 * the caller passed __GFP_NOWARN, or for __GFP_DMA requests when no
 * managed DMA zone exists.  Prints the current task's comm, the
 * formatted failure message, the gfp mask, the nodemask and cpuset
 * info, then a stack trace and a memory-info dump.
 */
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
/* Static rate-limit state: 1 report per 10*HZ */
static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
if ((gfp_mask & __GFP_NOWARN) ||
!__ratelimit(&nopage_rs) ||
((gfp_mask & __GFP_DMA) && !has_managed_dma()))
return;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
/* %pV expands the caller's format string; %pGg decodes the gfp flags */
pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
current->comm, &vaf, gfp_mask, &gfp_mask,
nodemask_pr_args(nodemask));
va_end(args);
cpuset_print_current_mems_allowed();
pr_cont("\n");
dump_stack();
warn_alloc_show_mem(gfp_mask, nodemask);
}
输出信息:
- 基本信息;
- 此次分配内存的进程名
- 字符串 page allocation failure: //// 这个也是稳定性经常遇到的问题的关键字
- order
- gfp_mask
- 等等
- 上下文调用堆栈
- 系统内存信息
<4>[ 176.567334] Binder:8340_2: page allocation failure: order:0, mode:0x100cca(GFP_HIGHUSER_MOVABLE), nodemask=(null),cpuset=foreground,mems_allowed=0
<4>[ 176.567356] CPU: 5 PID: 8360 Comm: Binder:8340_2 Tainted: G O 5.4.191-jgki-perfuser-android11-2-g4d2e507adb3c #1
<4>[ 176.567359] Hardware name: jlq,armv8 (DT)
<4>[ 176.567362] Call trace:
<4>[ 176.567376] dump_backtrace.cfi_jt+0x0/0x4
<4>[ 176.567385] show_stack+0x18/0x24
<4>[ 176.567391] dump_stack+0xb8/0x114
<4>[ 176.567397] warn_alloc+0xf8/0x14c
<4>[ 176.567401] __alloc_pages_slowpath+0x81c/0x830
<4>[ 176.567404] __alloc_pages_nodemask+0x2c8/0x334
<4>[ 176.567410] __read_swap_cache_async+0xbc/0x258
<4>[ 176.567414] swap_vma_readahead+0x378/0x404
<4>[ 176.567418] swapin_readahead+0x30/0x3c
<4>[ 176.567423] do_swap_page+0x1e8/0x868
<4>[ 176.567427] handle_pte_fault+0x10c/0x28c
<4>[ 176.567430] handle_mm_fault+0x1b0/0x2e8
<4>[ 176.567437] do_page_fault+0x28c/0x46c
<4>[ 176.567441] do_translation_fault+0x3c/0x54
<4>[ 176.567447] do_mem_abort+0x64/0xf4
<4>[ 176.567450] el1_da+0x1c/0xc0
<4>[ 176.567454] __arch_copy_to_user+0x58/0x218
<4>[ 176.567462] binder_ioctl_write_read+0x3fc/0x450
<4>[ 176.567467] binder_ioctl+0x230/0x10b0
<4>[ 176.567473] do_vfs_ioctl+0x380/0x6a0
<4>[ 176.567477] __arm64_sys_ioctl+0x74/0xa0
<4>[ 176.567483] el0_svc_common+0xc4/0x1ac
<4>[ 176.567487] el0_svc_handler+0x74/0x98
<4>[ 176.567490] el0_svc+0x8/0x100
<4>[ 176.567494] Mem-Info:
<4>[ 176.567507] active_anon:116861 inactive_anon:103507 isolated_anon:196\x0a active_file:97697 inactive_file:116460 isolated_file:84\x0a unevictable:16305 dirty:571 writeback:0 unstable:0\x0a slab_reclaimable:17882 slab_unreclaimable:48451\x0a mapped:168991 shmem:2720 pagetables:18303 bounce:0\x0a free:12246 free_pcp:517 free_cma:0
<4>[ 176.567515] Node 0 active_anon:467444kB inactive_anon:414028kB active_file:390788kB inactive_file:465840kB unevictable:65220kB isolated(anon):784kB isolated(file):336kB mapped:675964kB dirty:2284kB writeback:0kB shmem:10880kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
<4>[ 176.567525] DMA32 free:9444kB min:4380kB low:5032kB high:5100kB active_anon:16604kB inactive_anon:17068kB active_file:16808kB inactive_file:22124kB unevictable:20kB writepending:0kB present:123904kB managed:119808kB mlocked:20kB kernel_stack:2056kB shadow_call_stack:504kB pagetables:1960kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
<4>[ 176.567526] lowmem_reserve[]: 0 1276 1276
<4>[ 176.567539] Normal free:39540kB min:39544kB low:53928kB high:55488kB active_anon:451384kB inactive_anon:395620kB active_file:373536kB inactive_file:444772kB unevictable:65200kB writepending:2252kB present:2759680kB managed:2614944kB mlocked:65200kB kernel_stack:53928kB shadow_call_stack:13536kB pagetables:71252kB bounce:0kB free_pcp:2124kB local_pcp:4kB free_cma:0kB
<4>[ 176.567541] lowmem_reserve[]: 0 0 0
<4>[ 176.567545] DMA32: 69*4kB (UMEH) 55*8kB (UMEH) 136*16kB (UMEH) 85*32kB (UMEH) 31*64kB (UMEH) 10*128kB (H) 3*256kB (MH) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 9644kB
<4>[ 176.567568] Normal: 571*4kB (UMEH) 970*8kB (UMEH) 1209*16kB (UME) 258*32kB (UMEH) 36*64kB (UMH) 2*128kB (H) 1*256kB (H) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 40460kB
<4>[ 176.567588] 251944 total pagecache pages
<4>[ 176.567593] 18834 pages in swap cache
<4>[ 176.567596] Swap cache stats: add 564072, delete 545202, find 763727/925265
<4>[ 176.567598] Free swap = 811260kB
<4>[ 176.567600] Total swap = 1572860kB
<4>[ 176.567602] 720896 pages RAM
<4>[ 176.567604] 0 pages HighMem/MovableOnly
<4>[ 176.567606] 37208 pages reserved
<4>[ 176.567607] 5120 pages cma reserved
例如这里,free pages 有39540kB,min 水位为 39544kB,已经处于内存严重不足的情况,在经过直接内存回收并不能回收到需要分配的页面时,就会进入到这里
2. 慢速分配的流程图
3. 页框分配器总结
伙伴系统分配可用页框给申请者时,首先会根据zonelist对每个可用的zone进行快速分配,成功则返回第一个页框的页描述符,如果所有zone的快速分配都不成功,则会对zonelist中的zone进行慢速分配,慢速分配中会进行内存回收、内存压缩,同时唤醒kswapd线程并行地进行内存回收工作,之后再尝试继续分配
在快速分配中,如果条件允许会以low阀值遍历两次zonelist中的zone,整个快速分配的流程是:从zonelist中取出一个zone,检查此zone标志判断是否可通过此zone分配内存,如果 zone的空闲内存 - 需要申请的内存 < 阀值 ,伙伴系统则会将zone的一些快速内存回收,然后再次判断阀值和空闲内存与申请内存大小直接的关系,如果 zone的空闲内存 - 需要申请的内存 > 阀值,则调用buffered_rmqueue()函数从此zone中的分配内存,否则,选取下一个zone继续执行这段操作。当zonelist中的所有zone都遍历完成后,还是没有分配到内存,如果条件允许会再次遍历一遍。由于在慢速过程中也会调用此函数进行快速内存分配,所以阀值是由调用者传进来,因为不同情况使用的阀值是不同的,比如第一次快速分配过程中,会使用zone的low阀值进行分配,而进入慢速分配过程中,会使用min阀值进行分配。
在伙伴系统中有一个每CPU高速缓存,里面保存着以migratetype分类的单页框的双向链表,当申请内存者只需要一个页框时,内核会从每CPU高速缓存中相应类型的单页框链表中获取一个页框交给申请者,这样的好处是,当释放单个页框时会放入每CPU高速缓存链表,如果这时有需要申请单个页框,就把这个刚刚释放的页框交付出去,因为这个页框可能还存在于cache中,处理时就可直接处理cache而不用把这个页框再放入cache中,提高了cache的命中率,这样的页框就称为“热”页。每CPU高速缓存维护这些所有类型的单页框双向链表时,把刚释放的页框从链表头插入,申请“热”页时就从链表头拿出页框,申请“冷”页时则从链表尾拿出。
最后整理一下,如果一次分配,从开始到最后都没有成功,所走的路径是:
- 遍历zonelist,从zonelist中获取一个zone
- 检查zone如果分配后,空闲页框是否会低于allow_low
- 对此zone回收一些文件映射页和slab使用的页
- 再次检查zone如果分配后,空闲页框是否会低于allow_low
- 尝试从此zone分配页框(1个页优先从每CPU高速缓存分配,连续页框优先从需要的类型(migratetype)分配,如果不行再从其他migratetype分配)
- free_order小于11的情况, free_order++, 再次尝试第5步.如果free_order大于等于11, 则走第7步
- 跳到第1步,遍历zonelist结束则到下一步
- 再重新遍历zonelist一次,如果重新遍历过则到下一步
- 进入慢速分配
- 唤醒所有kswapd内核线程
- 再次尝试一次1~7步骤进行分配
- 如果有ALLOC_NO_WATERMARKS,则尝试分配预留的内存
- 进行异步内存压缩,然后尝试分配内存
- 尝试调用__alloc_pages_direct_reclaim()进行内存回收,然后尝试分配内存
- 使用oom杀掉oom_score较大的进程,每个进程都有一个oom_score(在/proc/PID/oom_score)
- 尝试轻同步内存压缩,然后尝试分配内存
- 压缩后再次尝试1~7步骤进行分配