
Linux memory management

2019-05-05 by jinliang

Physical memory

Memory structure

Physical memory description

Memory models:

  • UMA: Uniform Memory Access
  • NUMA: Non-Uniform Memory Access

Inspecting the phone's kernel configuration shows that CONFIG_NUMA is not defined, so this device uses the UMA model.
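
This can also be confirmed at runtime by walking the online nodes: on a UMA kernel only node 0 exists. A minimal kernel-module sketch (the module and function names are made up for illustration):

#include <linux/module.h>
#include <linux/nodemask.h>

static int __init uma_check_init(void)
{
    int nid;

    pr_info("online memory nodes: %d\n", num_online_nodes());
    /* On a UMA kernel (CONFIG_NUMA unset) this loop runs once, for node 0. */
    for_each_online_node(nid)
        pr_info("node %d is online\n", nid);
    return 0;
}

static void __exit uma_check_exit(void)
{
}

module_init(uma_check_init);
module_exit(uma_check_exit);
MODULE_LICENSE("GPL");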

Node: pg_data_t

pg_data_t is the basic structure representing a memory node. It is defined as follows:

./include/linux/mmzone.h

typedef struct pglist_data {
    //Zones in this node, at most MAX_NR_ZONES; unused trailing array slots are zero-filled.
    struct zone node_zones[MAX_NR_ZONES];
    //Specifies the fallback nodes and their zones, used to allocate memory from a fallback node when the current node has no free space.
    struct zonelist node_zonelists[MAX_ZONELISTS];
    //Number of (populated) zones in this node.
    int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
    struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
    struct page_ext *node_page_ext;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
    struct bootmem_data *bdata;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
    /*
     * Must be held any time you expect node_start_pfn, node_present_pages
     * or node_spanned_pages stay constant.  Holding this will also
     * guarantee that any pfn_valid() stays that way.
     *
     * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
     * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
     *
     * Nests above zone->lock and zone->span_seqlock
     */
    spinlock_t node_size_lock;
#endif
//  Logical number of this node's first page frame. Page frames are numbered
//  sequentially across all nodes, so every frame number is globally unique.
    unsigned long node_start_pfn;
//  node_present_pages is the number of physical page frames in the node, while
//  node_spanned_pages is the node's length in page frames including holes. The
//  two may differ because the range can contain holes with no real page frames.
    unsigned long node_present_pages; /* total number of physical pages */
    unsigned long node_spanned_pages; /* total size of physical page
                         range, including holes */
    int node_id;
    wait_queue_head_t kswapd_wait;
    wait_queue_head_t pfmemalloc_wait;
    struct task_struct *kswapd; /* Protected by
                       mem_hotplug_begin/end() */
    int kswapd_order;
    enum zone_type kswapd_classzone_idx;

    int kswapd_failures;        /* Number of 'reclaimed == 0' runs */
    /*
     * This is a per-node reserve of pages that are not available
     * to userspace allocations.
     */
    unsigned long       totalreserve_pages;
    /* Write-intensive fields used by page reclaim */
    ZONE_PADDING(_pad1_)
    /* Spinlock protecting concurrent access to this node's LRU lists */
    spinlock_t      lru_lock;
    /* Fields commonly accessed by the page reclaim scanner */
    struct lruvec       lruvec;

    /*
     * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
     * this node's LRU.  Maintained by the pageout code.
     */
    unsigned int inactive_ratio;

    unsigned long       flags;

    ZONE_PADDING(_pad2_)

    /* Per-node vmstats */
    struct per_cpu_nodestat __percpu *per_cpu_nodestats;
    atomic_long_t       vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
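
A hedged sketch of how these fields fit together: walk one node's node_zones[] array and print each populated zone's range (dump_node_zones is a hypothetical helper, not kernel API):

#include <linux/mmzone.h>
#include <linux/printk.h>

static void dump_node_zones(pg_data_t *pgdat)
{
    struct zone *zone;

    /* Unused slots in node_zones[] are zero-filled, so skip
     * anything populated_zone() reports as empty. */
    for (zone = pgdat->node_zones;
         zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
        if (!populated_zone(zone))
            continue;
        pr_info("node %d %s: start_pfn=%lu present=%lu spanned=%lu\n",
            pgdat->node_id, zone->name, zone->zone_start_pfn,
            zone->present_pages, zone->spanned_pages);
    }
}

Called as dump_node_zones(NODE_DATA(0)) on a UMA system, this prints whatever zones actually exist on the machine.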

Zone

Each zone is described by an instance of struct zone and represents a range of memory within a node:

./include/linux/mmzone.h

struct zone {
    /* Read-mostly fields */

    /* zone watermarks, access with *_wmark_pages(zone) macros */
    /* Three watermarks are computed for each zone at boot: WMARK_MIN,
     * WMARK_LOW and WMARK_HIGH. They are used by the page allocator
     * and by kswapd page reclaim. */
    unsigned long watermark[NR_WMARK];

    unsigned long nr_reserved_highatomic;

    /*
     * We don't know if the memory that we're going to allocate will be
     * freeable or/and it will be released eventually, so to avoid totally
     * wasting several GB of ram we must reserve some of the lower zone
     * memory (otherwise we risk to run OOM on the lower zones despite
     * there being tons of freeable ram on the higher zones).  This array is
     * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
     * changes.
     */
    long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
    int node;
#endif
    /* Back-pointer to the owning memory node */
    struct pglist_data  *zone_pgdat;
    /* Per-CPU page lists, used to reduce spinlock contention */
    struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM
    /*
     * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
     * In SPARSEMEM, this map is stored in struct mem_section
     */
    unsigned long       *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    /* Page frame number of the zone's first page */
    unsigned long       zone_start_pfn;

    /*
     * spanned_pages is the total pages spanned by the zone, including
     * holes, which is calculated as:
     *  spanned_pages = zone_end_pfn - zone_start_pfn;
     *
     * present_pages is physical pages existing within the zone, which
     * is calculated as:
     *  present_pages = spanned_pages - absent_pages(pages in holes);
     *
     * managed_pages is present pages managed by the buddy system, which
     * is calculated as (reserved_pages includes pages allocated by the
     * bootmem allocator):
     *  managed_pages = present_pages - reserved_pages;
     *
     * So present_pages may be used by memory hotplug or memory power
     * management logic to figure out unmanaged pages by checking
     * (present_pages - managed_pages). And managed_pages should be used
     * by page allocator and vm scanner to calculate all kinds of watermarks
     * and thresholds.
     *
     * Locking rules:
     *
     * zone_start_pfn and spanned_pages are protected by span_seqlock.
     * It is a seqlock because it has to be read outside of zone->lock,
     * and it is done in the main allocator path.  But, it is written
     * quite infrequently.
     *
     * The span_seq lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock.  It's good to
     * give them a chance of being in the same cacheline.
     *
     * Write access to present_pages at runtime should be protected by
     * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
     * present_pages should get_online_mems() to get a stable value.
     *
     * Read access to managed_pages should be safe because it's unsigned
     * long. Write access to zone->managed_pages and totalram_pages are
     * protected by managed_page_count_lock at runtime. Idealy only
     * adjust_managed_page_count() should be used instead of directly
     * touching zone->managed_pages and totalram_pages.
     */
    /* Pages in this zone that are managed by the buddy system */
    unsigned long       managed_pages;
    /* Total pages spanned by the zone, including holes */
    unsigned long       spanned_pages;
    /* Physical pages actually present; equals spanned_pages when the zone has no holes */
    unsigned long       present_pages;

    const char      *name;

#ifdef CONFIG_MEMORY_ISOLATION
    /*
     * Number of isolated pageblock. It is used to solve incorrect
     * freepage counting problem due to racy retrieving migratetype
     * of pageblock. Protected by zone->lock.
     */
    unsigned long       nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
    /* see spanned/present_pages for more description */
    seqlock_t       span_seqlock;
#endif

    int initialized;

    /* Write-intensive fields used from the page allocator */
    ZONE_PADDING(_pad1_)

    /* free areas of different sizes */
    // Free-area array of the buddy allocator: one entry per allocation order, each with its free lists
    struct free_area    free_area[MAX_ORDER];

    /* zone flags, see below */
    unsigned long       flags;

    /* Primarily protects free_area */
    // Spinlock protecting the zone (primarily free_area) against concurrent access.
    spinlock_t      lock;

    /* Write-intensive fields used by compaction and vmstats. */
    ZONE_PADDING(_pad2_)

    /*
     * When free pages are below this point, additional steps are taken
     * when reading the number of free pages to avoid per-cpu counter
     * drift allowing watermarks to be breached
     */
    unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* pfn where compaction free scanner should start */
    unsigned long       compact_cached_free_pfn;
    /* pfn where async and sync compaction migration scanner should start */
    unsigned long       compact_cached_migrate_pfn[2];
#endif

#ifdef CONFIG_COMPACTION
    /*
     * On compaction failure, 1<<compact_defer_shift compactions
     * are skipped before trying again. The number attempted since
     * last failure is tracked with compact_considered.
     */
    unsigned int        compact_considered;
    unsigned int        compact_defer_shift;
    int         compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
    /* Set to true when the PG_migrate_skip bits should be cleared */
    bool            compact_blockskip_flush;
#endif

    bool            contiguous;

    ZONE_PADDING(_pad3_)
    /* Zone statistics */
    atomic_long_t       vm_stat[NR_VM_ZONE_STAT_ITEMS];
    atomic_long_t       vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
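
To connect the watermark[] array, the vmstat counters and the buddy free_area[], here is a sketch (show_zone_state is a made-up helper) printing roughly what /proc/zoneinfo and /proc/buddyinfo expose:

#include <linux/mmzone.h>
#include <linux/vmstat.h>
#include <linux/printk.h>

static void show_zone_state(struct zone *zone)
{
    unsigned int order;

    /* The watermark[] array is read through the *_wmark_pages() macros,
     * as the comment in struct zone suggests. */
    pr_info("%s: min=%lu low=%lu high=%lu free=%lu\n",
        zone->name,
        min_wmark_pages(zone),
        low_wmark_pages(zone),
        high_wmark_pages(zone),
        zone_page_state(zone, NR_FREE_PAGES));

    /* One free list per order, exactly what /proc/buddyinfo reports. */
    for (order = 0; order < MAX_ORDER; order++)
        pr_info("  order %u: %lu free blocks\n",
            order, zone->free_area[order].nr_free);
}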

struct page

A page frame represents the smallest unit of system memory; an instance of struct page is created for each physical page.

./include/linux/mm_types.h

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * The objects in struct page are organized in double word blocks in
 * order to allows us to use atomic double word operations on portions
 * of struct page. That is currently only used by slub but the arrangement
 * allows the use of atomic double word operations on the flags/mapping
 * and lru list pointers also.
 */
/*
 * flags stores architecture-independent flags describing the page's attributes.
 * _refcount is a usage count: the number of references the kernel holds on this
 *      page. When it drops to 0 the kernel knows the page instance is unused and
 *      may discard it; while it is greater than 0 the instance must never be
 *      removed from memory.
 * _mapcount is the number of page table entries that point to this page.
 * lru is a list head used to keep the page on various lists, grouping pages by
 *      category; the most important categories are active and inactive pages.
 * The kernel can combine multiple adjacent pages into a larger compound page. */
struct page {
    /* First double word block */
    /* Atomic flags, some of which may be updated asynchronously */
    unsigned long flags;        /* Atomic flags, some possibly
                     * updated asynchronously */
    union {
        /* If the low bit is clear, this points to the inode's address_space
         * (or is NULL). If the page is mapped as anonymous memory, the low
         * bit is set and the pointer refers to an anon_vma object. */
        struct address_space *mapping;  /* If low bit clear, points to
                         * inode address_space, or NULL.
                         * If page mapped as anonymous
                         * memory, low bit is set,  and
                         * it points to anon_vma object:
                         * see PAGE_MAPPING_ANON below.
                         */
        void *s_mem;            /* slab first object */
        atomic_t compound_mapcount; /* first tail page */
        /* page_deferred_list().next     -- second tail page */
    };

    /* Second double word */
    union {
        /* Offset within the mapping */
        pgoff_t index;      /* Our offset within mapping. */
        void *freelist;     /* sl[aou]b first free object */
        /* page_deferred_list().prev    -- second tail page */
    };

    union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
        /* Used for cmpxchg_double in slub */
        unsigned long counters;
#else
        /*
         * Keep _refcount separate from slub cmpxchg_double data.
         * As the rest of the double word is protected by slab_lock
         * but _refcount is not.
         */
        unsigned counters;
#endif
        struct {

            union {
                /*
                 * Count of ptes mapped in mms, to show when
                 * page is mapped & limit reverse map searches.
                 *
                 * Extra information about page type may be
                 * stored here for pages that are never mapped,
                 * in which case the value MUST BE <= -2.
                 * See page-flags.h for more details.
                 */
                /* Count of page table entries mapping this page */
                atomic_t _mapcount;

                unsigned int active;        /* SLAB */
                struct {            /* SLUB */
                    /* For the SLUB allocator: object counts */
                    unsigned inuse:16;
                    unsigned objects:15;
                    unsigned frozen:1;
                };
                int units;          /* SLOB */
            };
            /*
             * Usage count, *USE WRAPPER FUNCTION* when manual
             * accounting. See page_ref.h
             */
            atomic_t _refcount;
        };
    };

    /*
     * Third double word block
     *
     * WARNING: bit 0 of the first word encode PageTail(). That means
     * the rest users of the storage space MUST NOT use the bit to
     * avoid collision and false-positive PageTail().
     */
    union {
        struct list_head lru;   /* Pageout list, eg. active_list
                     * protected by zone_lru_lock !
                     * Can be used as a generic list
                     * by the page owner.
                     */
        struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
                        * lru or handled by a slab
                        * allocator, this points to the
                        * hosting device page map.
                        */
        struct {        /* slub per cpu partial pages */
            struct page *next;  /* Next partial slab */
#ifdef CONFIG_64BIT
            int pages;  /* Nr of partial slabs left */
            int pobjects;   /* Approximate # of objects */
#else
            short int pages;
            short int pobjects;
#endif
        };

        struct rcu_head rcu_head;   /* Used by SLAB
                         * when destroying via RCU
                         */
        /* Tail pages of compound page */
        struct {
            unsigned long compound_head; /* If bit zero is set */

            /* First tail page only */
#ifdef CONFIG_64BIT
            /*
             * On 64 bit system we have enough space in struct page
             * to encode compound_dtor and compound_order with
             * unsigned int. It can help compiler generate better or
             * smaller code on some archtectures.
             */
            unsigned int compound_dtor;
            unsigned int compound_order;
#else
            unsigned short int compound_dtor;
            unsigned short int compound_order;
#endif
        };

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
        struct {
            unsigned long __pad;    /* do not overlay pmd_huge_pte
                         * with compound_head to avoid
                         * possible bit 0 collision.
                         */
            pgtable_t pmd_huge_pte; /* protected by page->ptl */
        };
#endif
    };

    /* Remainder is not double word aligned */
    union {
        /* Mapping-private, opaque data:
         * usually used for buffer_heads if PagePrivate is set;
         * used for a swp_entry_t if PageSwapCache is set;
         * indicates the order in the buddy system if PG_buddy is set. */
        unsigned long private;      /* Mapping-private opaque data:
                         * usually used for buffer_heads
                         * if PagePrivate set; used for
                         * swp_entry_t if PageSwapCache;
                         * indicates order in the buddy
                         * system if PG_buddy is set.
                         */
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
        spinlock_t *ptl;
#else
        spinlock_t ptl;
#endif
#endif
        struct kmem_cache *slab_cache;  /* SL[AU]B: Pointer to slab */
    };

#ifdef CONFIG_MEMCG
    struct mem_cgroup *mem_cgroup;
#endif

    /*
     * On machines where all RAM is mapped into kernel address space,
     * we can simply calculate the virtual address. On machines with
     * highmem some memory is mapped into kernel virtual memory
     * dynamically, so we need a place to store that address.
     * Note that this field could be 16 bits on x86 ... ;)
     *
     * Architectures with slow multiplication can define
     * WANT_PAGE_VIRTUAL in asm/page.h
     */
#if defined(WANT_PAGE_VIRTUAL)
    void *virtual;          /* Kernel virtual address (NULL if
                       not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
    int _last_cpupid;
#endif
};
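
A short sketch of elementary struct page use: converting a pfn to its descriptor and taking a reference through the wrapper functions the comments above insist on (inspect_pfn is a hypothetical helper; it assumes the page is already allocated, since get_page() must not be called on a free page):

#include <linux/mm.h>
#include <linux/page_ref.h>

static void inspect_pfn(unsigned long pfn)
{
    struct page *page;

    if (!pfn_valid(pfn))    /* the pfn may fall into a memory hole */
        return;

    page = pfn_to_page(pfn);
    get_page(page);         /* _refcount++, via the wrapper */
    pr_info("pfn %lu: refcount=%d mapcount=%d flags=%#lx\n",
        pfn, page_ref_count(page), page_mapcount(page),
        page->flags);
    put_page(page);         /* drop the extra reference */
}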

Page tables

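A lookup descends the hierarchy pgd → pud → pmd → pte. A minimal software walk, assuming a pre-4.11 layout without the intermediate p4d level that newer kernels insert between pgd and pud (illustrative only: no locking, and huge-page entries are not handled):

#include <linux/mm.h>
#include <asm/pgtable.h>

static pte_t *lookup_pte(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;

    pgd = pgd_offset(mm, addr);     /* top-level entry for addr */
    if (pgd_none(*pgd) || pgd_bad(*pgd))
        return NULL;

    pud = pud_offset(pgd, addr);
    if (pud_none(*pud) || pud_bad(*pud))
        return NULL;

    pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd) || pmd_bad(*pmd))
        return NULL;

    return pte_offset_map(pmd, addr);   /* caller must pte_unmap() */
}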

Physical memory initialization

memblock

memblock manages memory during early boot, before the buddy allocator is up.
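
A sketch of the typical calls, assuming a 4.x kernel where the early allocation helper is still named memblock_virt_alloc() (it became memblock_alloc() in 5.0); the base address and sizes are made up:

#include <linux/memblock.h>
#include <linux/bootmem.h>  /* memblock_virt_alloc() on 4.x kernels */
#include <linux/sizes.h>

static void __init early_mem_example(void)
{
    void *buf;

    /* Register a bank of physical RAM with memblock. */
    memblock_add(0x80000000UL, SZ_512M);

    /* Carve out a range that later allocations must never touch. */
    memblock_reserve(0x80000000UL, SZ_2M);

    /* Allocate zeroed memory before the buddy allocator exists. */
    buf = memblock_virt_alloc(SZ_4K, SZ_4K);
    (void)buf;  /* would be handed to early init code */
}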

Page table setup

ARM32 page table mapping

ARM64 page table mapping

Memory layout

ARM32 memory layout

ARM64 memory layout

Physical memory allocation
