Linux SMP multi-core boot: source code analysis

Posted by 走远了吗 on 2020-01-22 17:04:19

1 Initializing the SMP CPU map and the SMP operations

1.1 Initial setup of the CPU map

start_kernel

    ---------->smp_setup_processor_id

After Linux starts on the first CPU, it first calls smp_setup_processor_id() to set up the CPU index map:

void __init smp_setup_processor_id(void)
{

	int i;
//if this is an SMP system, read the current CPU's hardware id from the MPIDR coprocessor register; otherwise use 0
	u32 mpidr = is_smp() ? read_cpuid_mpidr() & MPIDR_HWID_BITMASK : 0;
//extract the affinity-level-0 field, i.e. cpu = mpidr & 0xff
	u32 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
//logical CPU 0 maps to the boot CPU
	cpu_logical_map(0) = cpu;
//nr_cpu_ids is the total number of CPUs in the system
	for (i = 1; i < nr_cpu_ids; ++i)
		cpu_logical_map(i) = i == cpu ? 0 : i; //initialize the map entries for the CPUs other than the boot CPU

	printk(KERN_INFO "Booting Linux on physical CPU 0x%x\n", mpidr);
}
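For reference, the helpers used above are thin wrappers. In this era of the kernel they are defined roughly as follows (paraphrased from arch/arm/include/asm/cputype.h and asm/smp_plat.h; treat the exact shape as a sketch). Note that MPIDR_AFFINITY_LEVEL(mpidr, 0) is simply mpidr & 0xff, the Aff0 field.

/* arch/arm/include/asm/cputype.h (roughly): MPIDR field helpers */
#define MPIDR_HWID_BITMASK	0xFFFFFF	/* MPIDR[23:0], the hardware CPU id */
#define MPIDR_INVALID		(~MPIDR_HWID_BITMASK)

#define MPIDR_LEVEL_BITS	8
#define MPIDR_LEVEL_MASK	((1 << MPIDR_LEVEL_BITS) - 1)
/* extract affinity level 0/1/2; level 0 is just mpidr & 0xff */
#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
	((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK)

/* arch/arm/include/asm/smp_plat.h: the logical-to-hardware CPU id map */
extern u32 __cpu_logical_map[];
#define cpu_logical_map(cpu)	__cpu_logical_map[cpu]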

The is_smp() check is straightforward:

static inline bool is_smp(void)
{
#ifndef CONFIG_SMP
	return false;
#elif defined(CONFIG_SMP_ON_UP)
	extern unsigned int smp_on_up;
	return !!smp_on_up;
#else
	return true;
#endif
}

read_cpuid_mpidr() simply reads the CPUID_MPIDR register to obtain the CPU's hardware id.
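For orientation, it boils down to a single MRC of the MPIDR register (CP15 c0, c0, 5). A simplified sketch; the real header goes through a generic read_cpuid() macro:

/* simplified sketch of read_cpuid_mpidr() */
static inline unsigned int read_cpuid_mpidr(void)
{
	unsigned int mpidr;

	/* MPIDR is CP15 c0, c0, opcode2 = 5 */
	asm volatile("mrc p15, 0, %0, c0, c0, 5" : "=r" (mpidr));
	return mpidr;
}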

1.2 Re-initializing the CPU map from the device tree and selecting the machine's SMP operations

First, let's look at how a device tree typically describes the CPU nodes:

cpus {
		enable-method = "allwinner,sun8i-a23";
		#address-cells = <1>;
		#size-cells = <0>;

		cpu0: cpu@0 {
			compatible = "arm,cortex-a7";
			device_type = "cpu";
			reg = <0>;
		};

		cpu@1 {
			compatible = "arm,cortex-a7";
			device_type = "cpu";
			reg = <1>;
		};
	};

Next, let's see how the kernel uses the device tree to initialize this information:

start_kernel

    ------------>setup_arch

          -------------->arm_dt_init_cpu_maps

void __init arm_dt_init_cpu_maps(void)
{
	/*
	 * Temp logical map is initialized with UINT_MAX values that are
	 * considered invalid logical map entries since the logical map must
	 * contain a list of MPIDR[23:0] values where MPIDR[31:24] must
	 * read as 0.
	 */
	struct device_node *cpu, *cpus;
	u32 i, j, cpuidx = 1;
	u32 mpidr = is_smp() ? read_cpuid_mpidr() & MPIDR_HWID_BITMASK : 0;

	u32 tmp_map[NR_CPUS] = { [0 ... NR_CPUS-1] = MPIDR_INVALID };
	bool bootcpu_valid = false;
//look up the /cpus root node
	cpus = of_find_node_by_path("/cpus");

	if (!cpus)
		return;
//iterate over all child nodes of cpus
	for_each_child_of_node(cpus, cpu) {
		u32 hwid;

		if (of_node_cmp(cpu->type, "cpu"))
			continue;

		pr_debug(" * %s...\n", cpu->full_name);
		/*
		 * A device tree containing CPU nodes with missing "reg"
		 * properties is considered invalid to build the
		 * cpu_logical_map.
		 */
//read the "reg" property into hwid; reg normally holds the CPU's hardware id (MPIDR affinity value)
		if (of_property_read_u32(cpu, "reg", &hwid)) {
			pr_debug(" * %s missing reg property\n",
				     cpu->full_name);
			return;
		}

		/*
		 * 8 MSBs must be set to 0 in the DT since the reg property
		 * defines the MPIDR[23:0].
		 */
//bits 31:24 of the reg value must be zero, as required by the ARM CPU device tree binding (reg holds MPIDR[23:0])
		if (hwid & ~MPIDR_HWID_BITMASK)
			return;

		/*
		 * Duplicate MPIDRs are a recipe for disaster.
		 * Scan all initialized entries and check for
		 * duplicates. If any is found just bail out.
		 * temp values were initialized to UINT_MAX
		 * to avoid matching valid MPIDR[23:0] values.
		 */
		for (j = 0; j < cpuidx; j++)
			if (WARN(tmp_map[j] == hwid, "Duplicate /cpu reg "
						     "properties in the DT\n"))
				return;

		/*
		 * Build a stashed array of MPIDR values. Numbering scheme
		 * requires that if detected the boot CPU must be assigned
		 * logical id 0. Other CPUs get sequential indexes starting
		 * from 1. If a CPU node with a reg property matching the
		 * boot CPU MPIDR is detected, this is recorded so that the
		 * logical map built from DT is validated and can be used
		 * to override the map created in smp_setup_processor_id().
		 */
		if (hwid == mpidr) {
			i = 0;      //this is the boot CPU
			bootcpu_valid = true;
		} else {
			i = cpuidx++;
		}

		if (WARN(cpuidx > nr_cpu_ids, "DT /cpu %u nodes greater than "
					       "max cores %u, capping them\n",
					       cpuidx, nr_cpu_ids)) {
			cpuidx = nr_cpu_ids;
			break;
		}
// The tmp_map array stashes the MPIDR value (hardware CPU id) of every CPU in
// the system. Indexing rule: tmp_map[0] holds the boot CPU's id, and the ids
// of the remaining CPUs are stored at indexes 1..NR_CPUS-1.
		tmp_map[i] = hwid;
	}

	if (!bootcpu_valid) {
		pr_warn("DT missing boot CPU MPIDR[23:0], fall back to default cpu_logical_map\n");
		return;
	}

	/*
	 * Since the boot CPU node contains proper data, and all nodes have
	 * a reg property, the DT CPU list can be considered valid and the
	 * logical map created in smp_setup_processor_id() can be overridden
	 */
	for (i = 0; i < cpuidx; i++) {
		set_cpu_possible(i, true);//mark this logical CPU as possible (usable by the system)
		cpu_logical_map(i) = tmp_map[i];//override cpu_logical_map with the DT-derived map
		pr_debug("cpu logical map 0x%x\n", cpu_logical_map(i));
	}
}

After this function returns, execution continues in setup_arch(), which initializes the SMP operations:

#ifdef CONFIG_SMP
	if (is_smp()) {
		smp_set_ops(mdesc->smp);
		smp_init_cpus();
	}
#endif

smp_set_ops() installs the operations:

static struct smp_operations smp_ops;
void __init smp_set_ops(struct smp_operations *ops)
{
	if (ops)
		smp_ops = *ops;
};
void __init smp_init_cpus(void)
{
	if (smp_ops.smp_init_cpus) //call the smp_init_cpus hook of the installed operations
		smp_ops.smp_init_cpus();
}
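What a platform's smp_init_cpus hook does is usually nothing more than marking the cores that exist as possible. A hypothetical minimal implementation (the function name and core count below are made up for illustration, not taken from any particular platform):

static void __init my_machine_smp_init_cpus(void)
{
	unsigned int i, ncores = 4;	/* e.g. read from the SCU, or hard-coded */

	if (ncores > nr_cpu_ids)
		ncores = nr_cpu_ids;

	for (i = 0; i < ncores; i++)
		set_cpu_possible(i, true);
}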

The ops passed in are supplied by mdesc->smp. First, here is how struct smp_operations is defined:

struct smp_operations {
#ifdef CONFIG_SMP
	/*
	 * Setup the set of possible CPUs (via set_cpu_possible)
	 */
//set up the CPUs the system may run on (the possible map)
	void (*smp_init_cpus)(void);
	/*
	 * Initialize cpu_possible map, and enable coherency
	 */
	void (*smp_prepare_cpus)(unsigned int max_cpus);

	/*
	 * Perform platform specific initialisation of the specified CPU.
	 */
 //per-CPU initialization run when a secondary (slave) CPU comes up
	void (*smp_secondary_init)(unsigned int cpu);
	/*
	 * Boot a secondary CPU, and assign it the specified idle task.
	 * This also gives us the initial stack to use for this CPU.
	 */
 //boot (wake up) a secondary (slave) CPU
	int  (*smp_boot_secondary)(unsigned int cpu, struct task_struct *idle);
#ifdef CONFIG_HOTPLUG_CPU
	int  (*cpu_kill)(unsigned int cpu);
	void (*cpu_die)(unsigned int cpu);
	int  (*cpu_disable)(unsigned int cpu);
#endif
#endif
};

When defining a machine, if the processor is SMP-capable, the machine description typically looks like this:

DT_MACHINE_START(VEXPRESS_DT, "ARM-Versatile Express")
	.dt_compat	= v2m_dt_match,
	.l2c_aux_val	= 0x00400000,
	.l2c_aux_mask	= 0xfe0fffff,
	.smp		= vexpress_smp_dt_ops,     //install the machine's SMP operations
MACHINE_END

const struct smp_operations vexpress_smp_dt_ops __initconst = {
	.smp_prepare_cpus	= vexpress_smp_dt_prepare_cpus,
	.smp_secondary_init	= versatile_secondary_init,
	.smp_boot_secondary	= versatile_boot_secondary,
	.smp_init_cpus		= vexpress_smp_init_ops,
#ifdef CONFIG_HOTPLUG_CPU
	.cpu_die		= vexpress_cpu_die,
#endif
};
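As an aside, the enable-method string in the device tree example above points at a newer registration path: instead of (or in addition to) machine_desc.smp, a platform can bind its smp_operations to that enable-method value with CPU_METHOD_OF_DECLARE(). Roughly what arch/arm/mach-sunxi/platsmp.c does (trimmed; the exact field list may differ between kernel versions):

static const struct smp_operations sun8i_smp_ops __initconst = {
	.smp_prepare_cpus	= sun8i_smp_prepare_cpus,
	.smp_boot_secondary	= sun8i_smp_boot_secondary,
};
CPU_METHOD_OF_DECLARE(sun8i_a23_smp, "allwinner,sun8i-a23", &sun8i_smp_ops);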

2 The SMP multi-core bring-up process

start_kernel

      ----------->rest_init

             ------------>kernel_init

                    ---------------->kernel_init_freeable

                           ------------------>smp_prepare_cpus

void __init smp_prepare_cpus(unsigned int max_cpus)
{
	unsigned int ncores = num_possible_cpus();//number of CPUs the system may use

	init_cpu_topology();

	smp_store_cpu_info(smp_processor_id());

	/*
	 * are we trying to boot more cores than exist?
	 */
	if (max_cpus > ncores)
		max_cpus = ncores;
	if (ncores > 1 && max_cpus) {
		/*
		 * Enable the local timer or broadcast device for the
		 * boot CPU, but only if we have more than one CPU.
		 */
		percpu_timer_setup();

		/*
		 * Initialise the present map, which describes the set of CPUs
		 * actually populated at the present time. A platform should
		 * re-initialize the map in the platforms smp_prepare_cpus()
		 * if present != possible (e.g. physical hotplug).
		 */
		init_cpu_present(cpu_possible_mask);

		/*
		 * Initialise the SCU if there are more than one CPU
		 * and let them know where to start.
		 */
// call the smp_prepare_cpus member of smp_operations to prepare for waking
// up the secondary CPUs; this is machine-specific and not analyzed here
		if (smp_ops.smp_prepare_cpus)
			smp_ops.smp_prepare_cpus(max_cpus);
	}
}
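A quick reminder of the global CPU masks being juggled here. They are declared in include/linux/cpumask.h; the inline comments below are ours, added for orientation:

extern const struct cpumask *const cpu_possible_mask; /* CPUs that could ever be brought up; fixed early in boot */
extern const struct cpumask *const cpu_present_mask;  /* CPUs physically present right now */
extern const struct cpumask *const cpu_online_mask;   /* CPUs that have been brought up and accept work */
extern const struct cpumask *const cpu_active_mask;   /* CPUs the scheduler may migrate tasks to */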

kernel_init_freeable

            ------------------>smp_init

void __init smp_init(void)
{
	unsigned int cpu;

	idle_threads_init(); //create an idle thread ahead of time for every CPU (each CPU has its own idle thread)

	/* FIXME: This should be done in userspace --RR */
	for_each_present_cpu(cpu) {
		if (num_online_cpus() >= setup_max_cpus)
			break;
		if (!cpu_online(cpu))
			cpu_up(cpu);//wake up every present CPU that is not yet online
	}

	/* Any cleanup work */
	printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
	smp_cpus_done(setup_max_cpus);
}
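For reference, idle_threads_init() is short: it forks an idle task for every possible CPU except the boot CPU, so that cpu_up() can later hand each secondary its idle thread without having to allocate anything. Roughly, from kernel/smpboot.c of this era:

void __init idle_threads_init(void)
{
	unsigned int cpu, boot_cpu;

	boot_cpu = smp_processor_id();

	for_each_possible_cpu(cpu) {
		if (cpu != boot_cpu)
			idle_init(cpu);		/* fork_idle() for this CPU */
	}
}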

cpu_up() is where the boot CPU starts bringing up the other cores:

cpu_up

    ------------->_cpu_up

             ------------------>__cpu_up

int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *idle)
{
	int ret;

	/*
	 * We need to tell the secondary core where to find
	 * its stack and the page tables.
	 */
	secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;//stack for the CPU being brought up; it shares a memory block with the idle task's descriptor (thread_info)
	secondary_data.pgdir = virt_to_phys(idmap_pgd);//physical address of its initial page directory; this directory is a copy (see init_static_idmap below)
	secondary_data.swapper_pg_dir = virt_to_phys(swapper_pg_dir);//physical address of swapper_pg_dir, the boot CPU's kernel page directory; it is loaded by the startup assembly but does not appear to be used further in this path
	__cpuc_flush_dcache_area(&secondary_data, sizeof(secondary_data));
	outer_clean_range(__pa(&secondary_data), __pa(&secondary_data + 1));

	/*
	 * Now bring the CPU into our world.
	 */
	ret = boot_secondary(cpu, idle); //the secondary CPU is actually started in this function
	if (ret == 0) {
		/*
		 * CPU was successfully started, wait for it
		 * to come online or time out.
		 */
		wait_for_completion_timeout(&cpu_running,
						 msecs_to_jiffies(1000)); //the boot CPU waits here (up to 1 s) for the secondary to come online, then carries on with its own work

		if (!cpu_online(cpu)) {
			pr_crit("CPU%u: failed to come online\n", cpu);
			ret = -EIO;
		}
	} else {
		pr_err("CPU%u: failed to boot: %d\n", cpu, ret);
	}

	secondary_data.stack = NULL;
	secondary_data.pgdir = 0;

	return ret;
}
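The layout of secondary_data matters for the assembly we are about to read, because the startup code addresses its fields by fixed offsets. In this era of the kernel it is declared roughly as follows (arch/arm/include/asm/smp.h):

struct secondary_data {
	unsigned long pgdir;		/* offset 0: physical address of the initial (idmap) page directory */
	unsigned long swapper_pg_dir;	/* offset 4: physical address of the kernel page directory */
	void *stack;			/* offset 8: top of the idle thread's stack */
};
extern struct secondary_data secondary_data;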

First, let's see how the idmap_pgd page directory gets copied:

static int __init init_static_idmap(void)
{
	idmap_pgd = pgd_alloc(&init_mm);//clone the kernel page directory of init_mm (the boot CPU's kernel page tables)
	if (!idmap_pgd)
		return -ENOMEM;

	pr_info("Setting up static identity map for 0x%p - 0x%p\n",
		__idmap_text_start, __idmap_text_end);
	identity_mapping_add(idmap_pgd, __idmap_text_start,
			     __idmap_text_end, 0);

	/* Flush L1 for the hardware to see this page table content */
	flush_cache_louis();

	return 0;
}
early_initcall(init_static_idmap);

As you can see, pgd_alloc() does the actual copying of the page directory; for a detailed analysis of the copy, see:

https://blog.csdn.net/oqqYuJi12345678/article/details/102828714

Next, look at boot_secondary():

int __cpuinit boot_secondary(unsigned int cpu, struct task_struct *idle)
{
	if (smp_ops.smp_boot_secondary)
		return smp_ops.smp_boot_secondary(cpu, idle);
	return -ENOSYS;
}

smp_boot_secondary is machine-specific; to make the discussion concrete, here is one vendor's implementation (Allwinner sun8i):

static int sun8i_smp_boot_secondary(unsigned int cpu,
				    struct task_struct *idle)
{
	u32 reg;

	if (!(prcm_membase && cpucfg_membase))
		return -EFAULT;

	spin_lock(&cpu_lock);

	/* Set CPU boot address */
    //program the boot entry address of the CPU being brought up
	writel(__pa_symbol(secondary_startup),
	       cpucfg_membase + CPUCFG_PRIVATE0_REG);

	/* Assert the CPU core in reset */
	writel(0, cpucfg_membase + CPUCFG_CPU_RST_CTRL_REG(cpu));

	/* Assert the L1 cache in reset */
	reg = readl(cpucfg_membase + CPUCFG_GEN_CTRL_REG);
	writel(reg & ~BIT(cpu), cpucfg_membase + CPUCFG_GEN_CTRL_REG);

	/* Clear CPU power-off gating */
	reg = readl(prcm_membase + PRCM_CPU_PWROFF_REG);
	writel(reg & ~BIT(cpu), prcm_membase + PRCM_CPU_PWROFF_REG);
	mdelay(1);

	/* Deassert the CPU core reset */
	writel(3, cpucfg_membase + CPUCFG_CPU_RST_CTRL_REG(cpu));

	spin_unlock(&cpu_lock);

	return 0;
}

With the registers programmed as above, the CPU being brought up starts executing at secondary_startup.

3 How a secondary CPU starts up

Apart from the boot CPU, all other CPUs begin execution at secondary_startup. Note: the "link-time address" mentioned below is the virtual address the kernel runs at once it is up and running; it is fixed when the kernel is linked and has nothing to do with the physical address.

#if defined(CONFIG_SMP)
	__CPUINIT
ENTRY(secondary_startup)
	/*
	 * Common entry point for secondary CPUs.
	 *
	 * Ensure that we're in SVC mode, and IRQs are disabled.  Lookup
	 * the processor type - there is no need to check the machine type
	 * as it has already been validated by the primary processor.
	 */
#ifdef CONFIG_ARM_VIRT_EXT
	bl	__hyp_stub_install_secondary
#endif
	safe_svcmode_maskall r9
-------------------------------------------------------(1)
	mrc	p15, 0, r9, c0, c0		@ get processor id
	bl	__lookup_processor_type
	movs	r10, r5				@ invalid processor?
	moveq	r0, #'p'			@ yes, error 'p'
 THUMB( it	eq )		@ force fixup-able long branch encoding
	beq	__error_p

	/*
	 * Use the page tables supplied from  __cpu_up.
	 */
	adr	r4, __secondary_data  //take the address of __secondary_data; its link-time address is virtual, and since the MMU is not on yet, the PC-relative adr yields its real physical address
//load the three words stored there: the link-time addresses of __secondary_data itself, of secondary_data, and of __secondary_switched
	ldmia	r4, {r5, r7, r12}		@ address to jump to after
//r4 (from adr) is the physical address of __secondary_data; subtracting its link-time address (r5) gives the phys-to-virt offset, used below to turn link-time addresses into physical ones
	sub	lr, r4, r5			@ mmu has been enabled
//compute the physical address of secondary_data and load secondary_data.pgdir
	ldr	r4, [r7, lr]			@ get secondary_data.pgdir
	add	r7, r7, #4  //advance by 4 bytes to the next field
//load secondary_data.swapper_pg_dir
	ldr	r8, [r7, lr]			@ get secondary_data.swapper_pg_dir

//use __enable_mmu as the return address
	adr	lr, BSYM(__enable_mmu)		@ return address

//stash the link-time address of __secondary_switched in r13; by the time it is used the MMU is on, so using the virtual address directly is fine
	mov	r13, r12			@ __secondary_switched address

-------------------------------------------------------------(2)
//jump to the architecture-specific "initialise processor" routine
 ARM(	add	pc, r10, #PROCINFO_INITFUNC	) @ initialise processor
						  @ (return control reg)
 THUMB(	add	r12, r10, #PROCINFO_INITFUNC	)
 THUMB(	mov	pc, r12				)
ENDPROC(secondary_startup)

	/*
	 * r6  = &secondary_data
	 */
ENTRY(__secondary_switched)
	ldr	sp, [r7, #4]			@ get secondary_data.stack
	mov	fp, #0
	b	secondary_start_kernel
ENDPROC(__secondary_switched)

	.align

	.type	__secondary_data, %object
__secondary_data:
	.long	.
	.long	secondary_data
	.long	__secondary_switched
#endif /* defined(CONFIG_SMP) */

(1)

mrc    p15, 0, r9, c0, c0        @ get processor id
bl    __lookup_processor_type   //look up the proc_info list entry matching this CPU id
movs    r10, r5                @ invalid processor?  //r10 now holds the start address of the matching proc_info entry

For a more detailed explanation, see this article:

https://blog.csdn.net/oqqYuJi12345678/article/details/99654760

(2) __lookup_processor_type above found the proc_info structure for this processor. At offset PROCINFO_INITFUNC into that structure sits the architecture-specific setup function; the article linked above describes this in detail as well. Returning from that init function is normally just:

mov	pc, lr

Execution then resumes at the address we placed in lr, i.e. it jumps to __enable_mmu, which together with __turn_mmu_on enables the MMU; both are described in detail in the article above. The point to watch here is that the code that switches the MMU on must be identity-mapped, so that execution does not break across the instant the MMU comes on. When the boot CPU came up, we saw the identity mapping being created as part of its page-table setup; if interested, see:

https://blog.csdn.net/oqqYuJi12345678/article/details/96029177

After the boot CPU jumps into C code, it first cleans up the user-space part of the page tables. The physical memory we use starts at 0x30000000, which lies in the user-space range once the MMU is on, so it gets cleared and that identity mapping no longer exists. So where is the identity mapping used when bringing up the secondary cores set up?

In fact, init_static_idmap(), shown above, creates the identity mapping right after copying the kernel page directory for the secondary CPUs:

static int __init init_static_idmap(void)
{
	...
//the identity-mapping setup
	identity_mapping_add(idmap_pgd, __idmap_text_start,
			     __idmap_text_end, 0);
	...
}

It identity-maps the code between __idmap_text_start and __idmap_text_end. To see which code that actually is, look at how the linker script handles it, in arch/arm/kernel/vmlinux.lds.S:

	VMLINUX_SYMBOL(__idmap_text_start) = .;				\
	*(.idmap.text)							\
	VMLINUX_SYMBOL(__idmap_text_end) = .;				\

So it is whatever code is placed in the .idmap.text section. Now look at how __turn_mmu_on, the function that enables the MMU, is defined:

	.align	5
	.pushsection	.idmap.text, "ax"
ENTRY(__turn_mmu_on)
	mov	r0, r0
	instr_sync
	mcr	p15, 0, r0, c1, c0, 0		@ write control reg
	mrc	p15, 0, r3, c0, c0, 0		@ read id reg
	instr_sync
	mov	r3, r3
	mov	r3, r13
	mov	pc, r3
__turn_mmu_on_end:
ENDPROC(__turn_mmu_on)
	.popsection

As you can see, __turn_mmu_on is itself placed in .idmap.text, which clears everything up. After enabling the MMU, it takes its return address from r13, which we set earlier to __secondary_switched:

ENTRY(__secondary_switched)
	ldr	sp, [r7, #4]			@ get secondary_data.stack //set up the stack first, so that C code can run
	mov	fp, #0
	b	secondary_start_kernel //jump to secondary_start_kernel
ENDPROC(__secondary_switched)

which finally branches to the C entry point, secondary_start_kernel():

asmlinkage void __cpuinit secondary_start_kernel(void)
{
	struct mm_struct *mm = &init_mm;
	unsigned int cpu;

	/*
	 * The identity mapping is uncached (strongly ordered), so
	 * switch away from it before attempting any exclusive accesses.
	 */
---------------------------------------------------------(1)
	cpu_switch_mm(mm->pgd, mm);
	local_flush_bp_all();
	enter_lazy_tlb(mm, current);
	local_flush_tlb_all();

	/*
	 * All kernel threads share the same mm context; grab a
	 * reference and switch to it.
	 */
	cpu = smp_processor_id();
	atomic_inc(&mm->mm_count);
	current->active_mm = mm;
	cpumask_set_cpu(cpu, mm_cpumask(mm));

	cpu_init();//set up the exception-mode stacks

	printk("CPU%u: Booted secondary processor\n", cpu);

	preempt_disable();
	trace_hardirqs_off();

	/*
	 * Give the platform a chance to do its own initialisation.
	 */
	if (smp_ops.smp_secondary_init)
		smp_ops.smp_secondary_init(cpu);

	notify_cpu_starting(cpu);

	calibrate_delay(); //CPUs may run at different frequencies, so recompute the delay-loop calibration

	smp_store_cpu_info(cpu);

	/*
	 * OK, now it's safe to let the boot CPU continue.  Wait for
	 * the CPU migration code to notice that the CPU is online
	 * before we continue - which happens after __cpu_up returns.
	 */
	set_cpu_online(cpu, true);
	complete(&cpu_running); //tell the boot CPU that this CPU has started successfully

	/*
	 * Setup the percpu timer for this CPU.
	 */
	percpu_timer_setup();

	local_irq_enable();
	local_fiq_enable();

	/*
	 * OK, it's off to the idle thread for us
	 */
-----------------------------------------------------------(2)
	cpu_startup_entry(CPUHP_ONLINE);
}

(1) Switch the page tables so that this CPU uses the same page directory as the boot CPU. This shows that, in the end, all SMP CPUs share one set of kernel page tables. For a detailed analysis of the switch, see:

https://blog.csdn.net/oqqYuJi12345678/article/details/102758457

(2) Hand the CPU over to its idle loop. For more detail, see:

https://blog.csdn.net/oqqYuJi12345678/article/details/102876424

At this point, bring-up of the secondary CPUs is complete.

Some of the content in this article draws on:

https://blog.csdn.net/u013836909/article/details/94204557
