In the previous post we analyzed how the Master works; in this one we look at how the Worker works. The Worker's job consists of two main parts, launching Executors and launching Drivers, after which it reports the corresponding launch and state-change messages back to the Master.
Below is the Worker's workflow diagram:
After an Application registers with the Master, the Master sends launch commands to the Worker. Once the Worker node is up, it handles two messages, LaunchDriver and LaunchExecutor, to launch Drivers and Executors respectively. The two launch paths are largely the same: the Worker first creates a DriverRunner or ExecutorRunner to encapsulate the Driver's or Executor's information, creates the corresponding working directory, and then calls the runner's internal start() method. Inside start(), a ProcessBuilder is created to launch the actual process, and the runner waits for the Driver or Executor to finish and exit. Finally, the runner sends a DriverStateChanged (or ExecutorStateChanged) message to the Worker it belongs to, and the Worker in turn forwards that state change to the Master.
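To make that message flow concrete, here is a minimal, self-contained sketch of the protocol described above. The case classes, IDs, and state strings are made up for illustration only; they are not the real Spark classes, which carry many more fields.

object WorkerMessageFlowSketch {
  // Simplified stand-ins for the messages exchanged between Master, Worker and the runners
  case class LaunchDriver(driverId: String)
  case class DriverStateChanged(driverId: String, state: String)

  def main(args: Array[String]): Unit = {
    // 1. Master -> Worker: ask the Worker to launch a driver
    val launch = LaunchDriver("driver-20240101-0001")

    // 2. The Worker creates a DriverRunner, which runs the driver process in its own
    //    thread; when the process exits, the runner reports the final state to the Worker
    val fromRunner = DriverStateChanged(launch.driverId, "FINISHED")

    // 3. Worker -> Master: forward the same state-change message and release the
    //    driver's CPU cores and memory in the Worker's bookkeeping
    println(s"Worker forwards to Master: $fromRunner")
  }
}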
First, let's look at the source of the LaunchDriver handler that launches a Driver:
/**
 * Launch the Driver
 */
case LaunchDriver(driverId, driverDesc) => {
  logInfo(s"Asked to launch driver $driverId")
  // Wrap the Driver's information in a DriverRunner, which launches it in its own thread
  val driver = new DriverRunner(
    conf,
    driverId,
    workDir,
    sparkHome,
    driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf)),
    self,
    akkaUrl)
  // Put the DriverRunner into the cache (a HashMap)
  drivers(driverId) = driver
  // Launch the Driver
  driver.start()
  // Add the Driver's cores to the cores in use
  coresUsed += driverDesc.cores
  // Add the Driver's memory to the memory in use
  memoryUsed += driverDesc.mem
}
The DriverRunner's internal start() method:
/** Starts a thread to run and manage the driver. */
def start() = {
  // Start a Java thread
  new Thread("DriverRunner for " + driverId) {
    override def run() {
      try {
        // Create the Driver's working directory
        val driverDir = createWorkingDirectory()
        // Download the user jar needed to run the Driver
        val localJarFilename = downloadUserJar(driverDir)

        def substituteVariables(argument: String): String = argument match {
          case "{{WORKER_URL}}" => workerUrl
          case "{{USER_JAR}}" => localJarFilename
          case other => other
        }

        // Create the ProcessBuilder, passing in driverDesc, the Driver's memory size, etc.
        // TODO: If we add ability to submit multiple jars they should also be added here
        val builder = CommandUtils.buildProcessBuilder(driverDesc.command, driverDesc.mem,
          sparkHome.getAbsolutePath, substituteVariables)
        // Launch the Driver process
        launchDriver(builder, driverDir, driverDesc.supervise)
      }
      catch {
        case e: Exception => finalException = Some(e)
      }

      val state =
        if (killed) {
          DriverState.KILLED
        } else if (finalException.isDefined) {
          DriverState.ERROR
        } else {
          finalExitCode match {
            case Some(0) => DriverState.FINISHED
            case _ => DriverState.FAILED
          }
        }

      finalState = Some(state)

      // Send DriverStateChanged to the Worker this Driver belongs to
      worker ! DriverStateChanged(driverId, state, finalException)
    }
  }.start()
}
/**
 * Launch the Driver process
 * @param builder
 * @param baseDir
 * @param supervise
 */
private def launchDriver(builder: ProcessBuilder, baseDir: File, supervise: Boolean) {
  builder.directory(baseDir)
  def initialize(process: Process) = {
    // Redirect stdout and stderr to files (write the process's output and error logs to disk)
    val stdout = new File(baseDir, "stdout")
    CommandUtils.redirectStream(process.getInputStream, stdout)

    val stderr = new File(baseDir, "stderr")
    val header = "Launch Command: %s\n%s\n\n".format(
      builder.command.mkString("\"", "\" \"", "\""), "=" * 40)
    Files.append(header, stderr, UTF_8)
    CommandUtils.redirectStream(process.getErrorStream, stderr)
  }
  runCommandWithRetry(ProcessBuilderLike(builder), initialize, supervise)
}
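The core of launchDriver is the standard Java ProcessBuilder pattern: point the process at a working directory, redirect its stdout/stderr to files, start it, and wait for it to exit. Here is a minimal standalone sketch of that pattern; the command, directory, and file names are made up for illustration, and Spark copies the streams itself (CommandUtils.redirectStream / FileAppender) rather than using ProcessBuilder's built-in redirection.

import java.io.File

object LaunchProcessSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical working directory for the launched process
    val baseDir = new File("/tmp/driver-sketch")
    baseDir.mkdirs()

    // Build the child process; "echo" stands in for the real driver command
    val builder = new ProcessBuilder("echo", "hello from the driver process")
    builder.directory(baseDir)
    // Redirect the child's stdout/stderr to files in its working directory
    builder.redirectOutput(new File(baseDir, "stdout"))
    builder.redirectError(new File(baseDir, "stderr"))

    val process = builder.start()
    // Block until the child exits, then report its exit code
    val exitCode = process.waitFor()
    println(s"process exited with code $exitCode")
  }
}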
The Worker then handles the DriverStateChanged message sent back by the DriverRunner:

case DriverStateChanged(driverId, state, exception) => {
  state match {
    case DriverState.ERROR =>
      logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
    case DriverState.FAILED =>
      logWarning(s"Driver $driverId exited with failure")
    case DriverState.FINISHED =>
      logInfo(s"Driver $driverId exited successfully")
    case DriverState.KILLED =>
      logInfo(s"Driver $driverId was killed by user")
    case _ =>
      logDebug(s"Driver $driverId changed state to $state")
  }
  // After the DriverRunner sends DriverStateChanged to the Worker,
  // the Worker forwards the DriverStateChanged message to the Master
  master ! DriverStateChanged(driverId, state, exception)
  // Once the Driver has finished, remove it from the cache and release its resources
  val driver = drivers.remove(driverId).get
  finishedDrivers(driverId) = driver
  memoryUsed -= driver.driverDesc.mem
  coresUsed -= driver.driverDesc.cores
}
Next, the source of the LaunchExecutor handler inside the Worker class:
/**
 * Launch an Executor process
 */
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  if (masterUrl != activeMasterUrl) {
    logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
  } else {
    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))

      // Create the executor's working directory
      val executorDir = new File(workDir, appId + "/" + execId)
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }

      // Create local dirs for the executor. These are passed to the executor via the
      // SPARK_LOCAL_DIRS environment variable, and deleted by the Worker when the
      // application finishes.
      val appLocalDirs = appDirectories.get(appId).getOrElse {
        Utils.getOrCreateLocalRootDirs(conf).map { dir =>
          Utils.createDirectory(dir).getAbsolutePath()
        }.toSeq
      }
      appDirectories(appId) = appLocalDirs

      // Create an ExecutorRunner, which launches the Executor in its own thread
      val manager = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        akkaUrl,
        conf,
        appLocalDirs,
        ExecutorState.LOADING)
      executors(appId + "/" + execId) = manager
      // Launch the Executor
      manager.start()
      // Add the Executor's cores to the cores in use
      coresUsed += cores_
      // Add the Executor's memory to the memory in use
      memoryUsed += memory_
      // Tell the Master that the Executor's state has changed (it is now LOADING);
      // later, when the Executor exits, the ExecutorRunner sends the Worker an
      // ExecutorStateChanged message and the Worker forwards it to the Master
      master ! ExecutorStateChanged(appId, execId, manager.state, None, None)
    } catch {
      case e: Exception => {
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        if (executors.contains(appId + "/" + execId)) {
          executors(appId + "/" + execId).kill()
          executors -= appId + "/" + execId
        }
        master ! ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None)
      }
    }
  }
The ExecutorRunner's internal start() method:

/**
 * Launch the Executor
 */
def start() {
  // Create a Java thread that downloads and runs the Executor
  workerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run() { fetchAndRunExecutor() }
  }
  workerThread.start()
  // Shutdown hook that kills actors on shutdown.
  shutdownHook = new Thread() {
    override def run() {
      killProcess(Some("Worker shutting down"))
    }
  }
  Runtime.getRuntime.addShutdownHook(shutdownHook)
}
The thread created in start() runs fetchAndRunExecutor(), which builds the command, launches the Executor process, and waits for it to exit:

/**
 * Download and run the executor described in our ApplicationDescription
 */
def fetchAndRunExecutor() {
  try {
    // Create the ProcessBuilder and launch the process
    val builder = CommandUtils.buildProcessBuilder(appDesc.command, memory,
      sparkHome.getAbsolutePath, substituteVariables)
    val command = builder.command()
    logInfo("Launch command: " + command.mkString("\"", "\" \"", "\""))

    builder.directory(executorDir)
    builder.environment.put("SPARK_LOCAL_DIRS", appLocalDirs.mkString(","))
    // In case we are running this from within the Spark Shell, avoid creating a "scala"
    // parent process for the executor command
    builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")

    // Add webUI log urls
    val baseUrl =
      s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
    builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")

    process = builder.start()
    val header = "Spark Executor Command: %s\n%s\n\n".format(
      command.mkString("\"", "\" \"", "\""), "=" * 40)

    // Redirect its stdout and stderr to files
    val stdout = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdout, conf)

    val stderr = new File(executorDir, "stderr")
    Files.write(header, stderr, UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderr, conf)

    // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
    // or with nonzero exit code
    val exitCode = process.waitFor()
    state = ExecutorState.EXITED
    val message = "Command exited with code " + exitCode
    // Send an ExecutorStateChanged message to the Worker this Executor belongs to
    worker ! ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))
  } catch {
    case interrupted: InterruptedException => {
      logInfo("Runner thread for executor " + fullId + " interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    }
    case e: Exception => {
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      killProcess(Some(e.toString))
    }
  }
}
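FileAppender plays the same role here as CommandUtils.redirectStream does on the Driver path: it copies the child process's output streams to files from background threads, so the runner thread can simply block in waitFor(). Below is a minimal illustrative sketch of that idea; it is not Spark's FileAppender implementation, and the command and file paths are made up.

import java.io.{File, FileOutputStream, InputStream}

object StreamToFileSketch {
  // Copy an InputStream to a file from a background thread and return that thread
  def copyToFile(in: InputStream, file: File): Thread = {
    val t = new Thread(s"append-to-${file.getName}") {
      override def run(): Unit = {
        val out = new FileOutputStream(file, /* append = */ true)
        try {
          val buf = new Array[Byte](8192)
          var n = in.read(buf)
          while (n != -1) {
            out.write(buf, 0, n)
            n = in.read(buf)
          }
        } finally out.close()
      }
    }
    t.start()
    t
  }

  def main(args: Array[String]): Unit = {
    // "echo" stands in for the real executor command
    val process = new ProcessBuilder("echo", "executor output").start()
    val appender = copyToFile(process.getInputStream, new File("/tmp/stdout-sketch"))
    // Block until the child exits, then let the copier thread drain the remaining output
    val exitCode = process.waitFor()
    appender.join()
    println(s"executor process exited with code $exitCode")
  }
}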
That wraps up our walkthrough of the Worker node's workflow. If you spot any problems, corrections are very welcome!