非連續頁分配
對于內核來說,最好的情況當然是分配連續的物理內存,這樣效率高,分配簡單,但是這只是理想情況。當系統運行較長時間后,會產生大量碎片,導致內核內存中沒有連續的大塊內存。這在用戶空間是個很簡單的問題,因為用戶空間進程設計為使用處理器的分頁機制,代價是會降低效率且TLB占用額外內存。內核中也有同樣的技術,內核分配其虛擬地址空間的一部分用于建立非連續頁映射,也就是之前提到的vmalloc及內核映射。
vmalloc
在IA-32系統中,緊隨直接映射的前896MB之后,在插入8MB的安全隙之后,是一個用于管理非連續內存的區域,通過修改內核頁表可以將連續的虛擬內存映射到非連續的物理內存。

vmalloc 函數調用圖如下:

vmalloc分配內存時,先在虛擬地址空間中找到一塊連續的虛擬地址;如果虛擬地址空間沒有適合大小的連續空間,會嘗試一次整理,如果還是找不到就會失敗。找到之后,將vm_struct的地址addr指向該虛擬地址,計算分配的連續空間的頁數,通過伙伴系統分配指定頁數的0階物理內存,然后將其關聯到vm_struct的pages,最終分配結果如下圖:

vmalloc的代碼如下:
/*
 * Descriptor of one vmalloc-style allocation: a virtual address range
 * together with the array of physical pages that back it.
 */
struct vm_struct {
struct vm_struct *next;
void *addr; // virtual start address: VMALLOC_START+OFFSET
unsigned long size; // size in bytes
unsigned long flags;
struct page **pages; // points to the allocated non-contiguous pages
unsigned int nr_pages; // number of page frames
phys_addr_t phys_addr; // physical address (not used by the code shown here)
const void *caller;
};
struct vmap_area { /* a contiguous virtual address segment; segments are separated by a one-page guard page */
unsigned long va_start; /* virtual start address of the segment */
unsigned long va_end; /* virtual end address of the segment */
unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
struct llist_node purge_list; /* "lazy purge" list */
struct vm_struct *vm;
struct rcu_head rcu_head;
};
/*
 * vmalloc - allocate virtually contiguous memory
 * @size: number of bytes requested
 *
 * Forwards to __vmalloc_node_flags() with no NUMA node preference
 * (NUMA_NO_NODE, i.e. -1) and the GFP_KERNEL mask.
 *
 * gfp stands for "get free page". Commonly used gfp_mask values:
 *   %GFP_KERNEL   - Allocate normal kernel ram. May sleep.
 *   %GFP_NOWAIT   - Allocation will not sleep.
 *   %GFP_ATOMIC   - Allocation will not sleep. May use emergency pools.
 *   %GFP_HIGHUSER - Allocate memory from high memory on behalf of user.
 *
 * Returns whatever __vmalloc_node_flags() returns.
 */
void *vmalloc(unsigned long size)
{
	void *mem;

	mem = __vmalloc_node_flags(size, NUMA_NO_NODE, GFP_KERNEL);

	return mem;
}
/*
 * Reserve a range of kernel virtual addresses and describe it with a newly
 * allocated vm_struct.
 * @size:     requested size in bytes; rounded up to a whole number of pages
 * @align:    alignment of the range; recomputed from @size for VM_IOREMAP
 * @flags:    VM_* flags; unless VM_NO_GUARD is set one extra guard page is
 *            appended to the range
 * @start, @end: bounds of the virtual range to search
 * @node:     NUMA node used for the descriptor allocations
 * @gfp_mask: allocation flags (only the reclaim-related bits are used for
 *            the descriptor itself)
 * @caller:   recorded in the vm_struct by setup_vmalloc_vm()
 *
 * Returns the initialised vm_struct, or NULL if @size is zero, the
 * descriptor cannot be allocated, or no virtual range is available.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	size = PAGE_ALIGN(size);	/* round up to a multiple of the page size */
	if (unlikely(!size))
		return NULL;

	if (flags & VM_IOREMAP)
		align = 1ul << clamp_t(int, get_count_order_long(size),
				       PAGE_SHIFT, IOREMAP_MAX_ORDER);

	/* Small descriptor allocation; served by the slab allocator. */
	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;	/* otherwise setup_vmalloc_vm() would get NULL */

	if (!(flags & VM_NO_GUARD))	/* append the guard page */
		size += PAGE_SIZE;

	/* Pick a suitably sized contiguous segment inside [start, end). */
	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	setup_vmalloc_vm(area, va, flags, caller);
	return area;
}
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 * @size:     size of the segment in bytes
 * @align:    required alignment of the segment start
 * @vstart, @vend: bounds of the range in which the contiguous virtual
 *            segment must be placed
 * @node:     NUMA node used when allocating the vmap_area descriptor
 * @gfp_mask: allocation flags for the descriptor; also decides whether the
 *            failure path may block and whether it prints a warning
 *
 * Returns the inserted vmap_area, ERR_PTR(-ENOMEM) if the descriptor cannot
 * be allocated, or ERR_PTR(-EBUSY) if no suitable hole was found even after
 * purging lazily freed areas.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;
	struct vmap_area *first;

	/* Small descriptor allocation; served by the slab allocator. */
	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

retry:
	/*
	 * The search below reads and updates the shared rbtree, list and
	 * cache state; both exit paths unlock, so take the lock on every
	 * (re)entry.
	 */
	spin_lock(&vmap_area_lock);

	/*
	 * Do not use the cached search position when this request is more
	 * permissive than the one that produced it: no cache at all, a size
	 * smaller than the recorded hole, a lower vstart or a smaller align.
	 */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;

	/* find starting point for our search */
	if (free_vmap_cache) {
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)	/* cached position is below the requested range */
			goto nocache;
		if (addr + size < addr)	/* address arithmetic wrapped around */
			goto overflow;

	} else {
		addr = ALIGN(vstart, align);
		if (addr + size < addr)
			goto overflow;

		n = vmap_area_root.rb_node;
		first = NULL;

		/* Find the lowest allocated area that ends at or after addr. */
		while (n) {
			struct vmap_area *tmp;

			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)
					break;
				n = n->rb_left;
			} else
				n = n->rb_right;
		}

		if (!first)	/* nothing is allocated at or above addr */
			goto found;
	}

	/* Walk the address-sorted list until a large enough hole is found. */
	while (addr + size > first->va_start && addr + size <= vend) {
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		addr = ALIGN(first->va_end, align);
		if (addr + size < addr)
			goto overflow;

		if (list_is_last(&first->list, &vmap_area_list))
			goto found;

		first = list_next_entry(first, list);
	}

found:
	/*
	 * Check also calculated address against the vstart,
	 * because it can be 0 because of big align request.
	 */
	if (addr + size > vend || addr < vstart)
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);

	return va;

overflow:
	spin_unlock(&vmap_area_lock);
	/* First failure: purge lazily freed areas once and search again. */
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	/*
	 * If the caller may block, ask the registered notifiers to release
	 * address space and retry when they report having freed something.
	 */
	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;

		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}
/*
 * Back the virtual range described by @area with individually allocated
 * physical pages and map them with protection @prot.
 * @area:     vm_struct whose addr/size describe the virtual range
 * @gfp_mask: allocation flags for the pages
 * @prot:     page protection passed to map_vm_area()
 * @node:     NUMA node to allocate from, or NUMA_NO_NODE
 *
 * Returns area->addr on success and NULL on failure. If the page-pointer
 * array cannot be allocated, the area is removed and freed here; on later
 * failures the partially populated area is released through vfree().
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
/* add __GFP_HIGHMEM unless the caller asked for a DMA zone */
const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
0 :
__GFP_HIGHMEM;
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
/* page-pointer array larger than one page: allocate it via __vmalloc_node() */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
PAGE_KERNEL, node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
/* allocate the backing pages one at a time */
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
if (node == NUMA_NO_NODE)
page = alloc_page(alloc_mask|highmem_mask);
else
page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
area->pages[i] = page;
/* give other tasks a chance to run when the mask allows blocking */
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
/* install the page-table entries for the whole range */
if (map_vm_area(area, prot, pages))
goto fail;
return area->addr;
fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}
/*
 * Annotated copy of __vmalloc_area_node(): populate @area with physical
 * pages and map them. Returns area->addr on success, NULL on failure.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
0 :
__GFP_HIGHMEM;
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) { /* if the struct page pointer array is larger than one page, allocate it through vmalloc */
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
PAGE_KERNEL, node, area->caller);
} else { /* otherwise allocate it through the slab allocator */
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
if (node == NUMA_NO_NODE)
page = alloc_page(alloc_mask|highmem_mask); /* buddy-system allocation; with no node specified, a node is chosen according to the memory policy */
else
page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); /* buddy-system allocation of an order-0 page */
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
if (map_vm_area(area, prot, pages))
goto fail;
return area->addr;
fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}
vfree
vmalloc是分配內存,那么釋放內存就是vfree了,其基本邏輯如下圖:

- 通過紅黑樹找到虛擬地址空間vmap_area;
- 釋放虛擬地址空間
- 通過伙伴系統釋放物理內存
查找虛擬地址空間的代碼如下:
/*
 * Look up the vmap_area whose range [va_start, va_end) contains @addr by
 * descending the address-sorted red-black tree rooted at vmap_area_root.
 *
 * Returns the matching area, or NULL if no area covers @addr.
 */
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *node;

	for (node = vmap_area_root.rb_node; node; ) {
		struct vmap_area *area = rb_entry(node, struct vmap_area, rb_node);

		if (addr >= area->va_start && addr < area->va_end)
			return area;

		/* Not inside this area: descend towards the side @addr lies on. */
		if (addr < area->va_start)
			node = node->rb_left;
		else
			node = node->rb_right;
	}

	return NULL;
}
內核映射
vmalloc提供了HIGH_MEM zone中虛擬地址到物理地址的映射,但是是匿名隱式映射,沒辦法指定將虛擬地址綁定到具體的物理頁幀,更適用于稍微頻繁一點的分配及釋放。所以內核提供了一些其他內核映射方式,在內存管理(簡介)中有介紹到持久映射及固定映射來適應一些其他場景。
持久化內存映射
持久化映射可以指定將虛擬地址空間映射到指定的頁,通過kmap及kunmap來分配及釋放,同vmalloc一樣,持久化映射也需要在虛擬地址空間中指定具體的區域來做映射,該區域位于VMALLOC區域之后,從PKMAP_BASE到FIXADDR_START,該區域用于持久映射。
在持久化映射中一個物理頁與一個虛擬地址一一映射,其數據結構如下:
/* One entry of the page -> virtual address hash table. */
struct page_address_map {
struct page *page; /* the physical page in the memory node */
void *virtual; /* corresponding start address in the virtual address space */
struct list_head list; /* links entries that share a hash slot */
};
為了便于組織,映射保存在散列表page_address_htable中,結構page_address_map中的鏈表list用于建立散列表中hash碰撞的溢出元素,其內存數據結構圖如下,其中mem_map在新的內核代碼中已經更名為node_mem_map;pkmap_count為虛擬頁引用計數。

kmap
kmap用于建立物理地址到虛擬起始地址的映射,具體代碼如下:
/*
 * Return a kernel virtual address for @page. Must not be called from
 * interrupt context (enforced with BUG_ON).
 *
 * Pages outside high memory are resolved through page_address(); high
 * memory pages are handed to kmap_high().
 */
void *kmap(struct page *page)
{
	BUG_ON(in_interrupt());

	return PageHighMem(page) ? kmap_high(page) : page_address(page);
}
/*
 * Get the virtual start address of the given physical page.
 *
 * Pages that are not in high memory are translated directly with
 * lowmem_page_address(). For high memory pages the hash slot returned by
 * page_slot() is searched under the slot's lock.
 *
 * Returns the mapped address, or NULL when the page has no entry in its slot.
 */
void *page_address(const struct page *page)
{
	struct page_address_slot *slot;
	struct page_address_map *entry;
	unsigned long irq_flags;
	void *vaddr = NULL;

	if (!PageHighMem(page))
		return lowmem_page_address(page);

	slot = page_slot(page);

	spin_lock_irqsave(&slot->lock, irq_flags);
	if (!list_empty(&slot->lh)) {
		/* Several pages may hash to the same slot; walk the chain. */
		list_for_each_entry(entry, &slot->lh, list) {
			if (entry->page != page)
				continue;
			vaddr = entry->virtual;
			break;
		}
	}
	spin_unlock_irqrestore(&slot->lock, irq_flags);

	return vaddr;
}
/*
 * Map a high memory page into the persistent kmap area and take a
 * reference on the mapping.
 *
 * The existing address is looked up only after lock_kmap(), because the
 * recorded virtual address of a highmem page cannot be trusted before the
 * lock is held. If the page has no mapping yet, map_new_virtual() creates
 * one. The slot's counter is then incremented and must end up >= 2.
 *
 * Returns the virtual address of the mapping.
 */
void *kmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long slot;

	lock_kmap();

	vaddr = (unsigned long)page_address(page);	/* already mapped? */
	if (vaddr == 0)
		vaddr = map_new_virtual(page);

	slot = PKMAP_NR(vaddr);
	pkmap_count[slot]++;				/* one more user of this mapping */
	BUG_ON(pkmap_count[slot] < 2);

	unlock_kmap();

	return (void *)vaddr;
}
/*
 * Find a free pkmap slot for @page, install the page-table entry and record
 * the page -> address association.
 *
 * The function drops and re-takes the kmap lock around schedule() while it
 * waits, so the caller presumably holds that lock on entry (kmap_high()
 * calls it after lock_kmap()).
 *
 * Returns the virtual address now mapping @page. If another task mapped the
 * page while this one slept, that existing address is returned instead.
 */
static inline unsigned long map_new_virtual(struct page *page)
{
unsigned long vaddr;
int count;
unsigned int last_pkmap_nr;
unsigned int color = get_pkmap_color(page);
start:
count = get_pkmap_entries_count(color); /* number of slots to try without sleeping when no mappable virtual space is found */
/* Find an empty entry */
for (;;) {
last_pkmap_nr = get_next_pkmap_nr(color);
if (no_more_pkmaps(last_pkmap_nr, color)) {
flush_all_zero_pkmaps(); /* key step for actually releasing mappings (original author's note); details are in that helper, not shown here */
count = get_pkmap_entries_count(color);
}
if (!pkmap_count[last_pkmap_nr])
break; /* Found a usable entry */
if (--count)
continue;
/*
* Sleep for somebody else to unmap their entries
*/
{
DECLARE_WAITQUEUE(wait, current);
wait_queue_head_t *pkmap_map_wait =
get_pkmap_wait_queue_head(color);
__set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(pkmap_map_wait, &wait);
unlock_kmap();
schedule();
remove_wait_queue(pkmap_map_wait, &wait);
lock_kmap();
/* Somebody else might have mapped it while we slept */
if (page_address(page))
return (unsigned long)page_address(page);
/* Re-start */
goto start;
}
}
vaddr = PKMAP_ADDR(last_pkmap_nr);
set_pte_at(&init_mm, vaddr,
&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
pkmap_count[last_pkmap_nr] = 1; /* starts at 1; kmap_high() adds 1 after mapping, so only a count of 2 or more means the mapping is actually in use */
set_page_address(page, (void *)vaddr);
return vaddr;
}
kunmap
kunmap用于釋放映射,這是一個體系結構相關的函數,但大體實現差不多。
/*
 * Release a mapping obtained for @page. Must not be called from interrupt
 * context. Only high memory pages need any work; they are passed on to
 * kunmap_high(), everything else returns immediately.
 */
void kunmap(struct page *page)
{
	BUG_ON(in_interrupt());

	if (PageHighMem(page))
		kunmap_high(page);
}
/*
 * Drop one reference on the persistent mapping of @page.
 *
 * The slot's counter is only decremented; the page-table entry is not torn
 * down here. When the counter reaches 1 and tasks are waiting on the pkmap
 * wait queue for this page's color, they are woken after the lock is
 * released.
 */
void kunmap_high(struct page *page)
{
unsigned long vaddr;
unsigned long nr;
unsigned long flags;
int need_wakeup;
unsigned int color = get_pkmap_color(page);
wait_queue_head_t *pkmap_map_wait;
lock_kmap_any(flags);
vaddr = (unsigned long)page_address(page);
BUG_ON(!vaddr); /* the page must currently have a mapping */
nr = PKMAP_NR(vaddr); /* pkmap slot (virtual page) index for this virtual address */
need_wakeup = 0;
switch (--pkmap_count[nr]) {
case 0:
/* the count was already 1, i.e. there was no outstanding user */
BUG();
case 1:
/* no users left: check whether anybody is waiting for a slot */
pkmap_map_wait = get_pkmap_wait_queue_head(color);
need_wakeup = waitqueue_active(pkmap_map_wait);
}
unlock_kmap_any(flags);
/* the wake-up is issued only after the kmap lock has been dropped */
if (need_wakeup)
wake_up(pkmap_map_wait);
}
從上面的代碼可以看出來kunmap并不會真正的釋放頁,而只是將pkmap_count標記為1,表示已分配,但是未映射,只有在建立映射沒有可用虛擬地址時才清理一下未映射的虛擬地址,這在內存壓力不是特別大的時候能大大提高分配效率,但是當虛擬內存被分配比較滿的時候會大大降低分配效率。
固定映射
在上述代碼中可以看到kmap有可能會sleep,所以不能用于中斷處理程序。至于中斷程序是如何工作的,以及它為啥需要保證原子性,這個后續在中斷及信號處理中再詳細說明。為此內核提供了kmap_atomic來建立映射,kmap_atomic是一個體系結構相關的函數,其在IA-32體系結構中是通過關掉搶占及關掉pagefault來實現禁止上下文切換,然后做直接映射,其代碼如下:
/*
 * Map @page with protection @prot without sleeping.
 *
 * Preemption and page faults are disabled first and are still disabled when
 * the function returns, on both return paths (the matching re-enable is not
 * part of this excerpt). Pages outside high memory are resolved through
 * page_address(); high memory pages get a per-CPU fixmap slot.
 *
 * Returns the virtual address through which @page can be accessed.
 */
void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
preempt_disable();
pagefault_disable();
if (!PageHighMem(page))
return page_address(page);
type = kmap_atomic_idx_push(); /* atomically incremented id (original author's note) */
/* each CPU owns KM_TYPE_NR consecutive slots */
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
/* the chosen slot must not already hold a mapping */
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
/* Convert fixmap index x to a virtual address: x pages below FIXADDR_TOP. */
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
其分配圖如下:

依次映射到虛擬地址空間。
固定映射主要用于虛擬地址映射及內核內存與用戶空間內存拷貝。
如此內核就可以通過vmalloc分配非連續物理地址,并通過持久化映射及固定映射來建立虛擬地址到物理內存的非連續映射。
拓展閱讀
mem_policy
https://www.kernel.org/doc/Documentation/vm/numa_memory_policy.txt