Analysis of the spark-submit script


Submitting a job

./spark-submit \
--class cn.com.dtmobile.spark.DebugTest \
--master yarn \
--deploy-mode client \
--num-executors 3 \
--executor-cores 2 \
--executor-memory 1G \
/home/etluser/kong/debugTest/pucchSinr.jar

The ${SPARK_HOME}/bin/spark-submit script

if [ -z "${SPARK_HOME}" ]; then
  source "$(dirname "$0")"/find-spark-home  
fi

# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0

exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"

$@ stands for all of the arguments received:

$@=

--class cn.com.dtmobile.spark.DebugTest --master yarn --deploy-mode client --num-executors 3 --executor-cores 2 --executor-memory 1G /home/etluser/kong/debugTest/pucchSinr.jar
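As a small illustration, a hypothetical demo.sh (not part of Spark) shows that "$@" hands every argument through as a separate word, unchanged:

#!/usr/bin/env bash
# demo.sh: print each argument this script received on its own line
printf 'arg: %s\n' "$@"

Running ./demo.sh --master yarn --deploy-mode client prints:

arg: --master
arg: yarn
arg: --deploy-mode
arg: client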

What exec finally executes:
exec=

/home/etluser/kong/spark/spark-2.3.4-bin/spark-2.3.4-bin-hadoop2.6/bin/spark-class org.apache.spark.deploy.SparkSubmit --class cn.com.dtmobile.spark.DebugTest --master yarn --deploy-mode client --num-executors 3 --executor-cores 2 --executor-memory 1G /home/etluser/kong/debugTest/pucchSinr.jar

So the arguments passed to spark-class are:

org.apache.spark.deploy.SparkSubmit --class cn.com.dtmobile.spark.DebugTest --master yarn --deploy-mode client --num-executors 3 --executor-cores 2 --executor-memory 1G /home/etluser/kong/debugTest/pucchSinr.jar

The ${SPARK_HOME}/bin/spark-class script

if [ -z "${SPARK_HOME}" ]; then # if ${SPARK_HOME} is empty (zero length)
  source "$(dirname "$0")"/find-spark-home
fi

. "${SPARK_HOME}"/bin/load-spark-env.sh

# Find the java binary
if [ -n "${JAVA_HOME}" ]; then  # if ${JAVA_HOME} is set (non-empty)
  RUNNER="${JAVA_HOME}/bin/java"
else
  if [ "$(command -v java)" ]; then
    RUNNER="java"
  else
    echo "JAVA_HOME is not set" >&2
    exit 1
  fi
fi

# Find Spark jars.
if [ -d "${SPARK_HOME}/jars" ]; then # if the ${SPARK_HOME}/jars directory exists
  SPARK_JARS_DIR="${SPARK_HOME}/jars"
else
  SPARK_JARS_DIR="${SPARK_HOME}/assembly/target/scala-$SPARK_SCALA_VERSION/jars" # otherwise look under the assembly build directory
fi

if [ ! -d "$SPARK_JARS_DIR" ] && [ -z "$SPARK_TESTING$SPARK_SQL_TESTING" ]; then # if SPARK_JARS_DIR does not exist and the test-related variables are unset
  echo "Failed to find Spark jars directory ($SPARK_JARS_DIR)." 1>&2
  echo "You need to build Spark with the target \"package\" before running this program." 1>&2
  exit 1
else
  LAUNCH_CLASSPATH="$SPARK_JARS_DIR/*" # the launch classpath is every jar under SPARK_JARS_DIR
fi

# Add the launcher build dir to the classpath if requested.
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
  LAUNCH_CLASSPATH="${SPARK_HOME}/launcher/target/scala-$SPARK_SCALA_VERSION/classes:$LAUNCH_CLASSPATH"
fi

# For tests
if [[ -n "$SPARK_TESTING" ]]; then
  unset YARN_CONF_DIR
  unset HADOOP_CONF_DIR
fi

# The launcher library will print arguments separated by a NULL character, to allow arguments with
# characters that would be otherwise interpreted by the shell. Read that in a while loop, populating
# an array that will be used to exec the final command.
#
# The exit code of the launcher is appended to the output, so the parent shell removes it from the
# command array and checks the value to see if the launcher succeeded.
build_command() {
    # $RUNNER is java: it invokes org.apache.spark.launcher.Main on the launch classpath, passing along all of the spark-submit arguments; the launcher builds the JVM command shown below
  "$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
  printf "%d\0" $?
}

# Turn off posix mode since it does not allow process substitution
set +o posix
CMD=()
while IFS= read -d '' -r ARG; do
  CMD+=("$ARG")
done < <(build_command "$@")

COUNT=${#CMD[@]}
LAST=$((COUNT - 1))
LAUNCHER_EXIT_CODE=${CMD[$LAST]}

# Certain JVM failures result in errors being printed to stdout (instead of stderr), which causes
# the code that parses the output of the launcher to get confused. In those cases, check if the
# exit code is an integer, and if it's not, handle it as a special error case.
if ! [[ $LAUNCHER_EXIT_CODE =~ ^[0-9]+$ ]]; then
  echo "${CMD[@]}" | head -n-1 1>&2
  exit 1
fi

if [ $LAUNCHER_EXIT_CODE != 0 ]; then
  exit $LAUNCHER_EXIT_CODE
fi

CMD=("${CMD[@]:0:$LAST}")
exec "${CMD[@]}"

The final CMD that exec runs is:

${CMD[@]}=
/usr/lib/java/jdk1.8.0_144/bin/java -cp \
/home/etluser/kong/spark/spark-2.3.4-bin/spark-2.3.4-bin-hadoop2.6/conf/:/home/etluser/kong/spark/spark-2.3.4-bin/spark-2.3.4-bin-hadoop2.6/jars/* \
-Xmx1g \
org.apache.spark.deploy.SparkSubmit \
--master yarn \
--deploy-mode client \
--class cn.com.dtmobile.spark.DebugTest \
--num-executors 3 \
--executor-cores 2 \
--executor-memory 1G \
/home/etluser/kong/debugTest/pucchSinr.jar

That is, exec runs exactly the command generated by the launcher above.
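To see the NUL-separated hand-off in isolation, here is a toy sketch (not Spark code): fake_build_command is a made-up stand-in for the real launcher invocation, but the read loop and exit-code handling follow the same pattern as spark-class:

#!/usr/bin/env bash
# Toy reproduction of the spark-class pattern.
fake_build_command() {
  printf '%s\0' echo "an arg with spaces" --master yarn   # NUL-separated "command"
  printf '%d\0' $?                                        # append the exit code, like build_command
}

set +o posix                          # mirrors spark-class; process substitution needs non-POSIX mode
CMD=()
while IFS= read -d '' -r ARG; do
  CMD+=("$ARG")
done < <(fake_build_command)

LAST=$((${#CMD[@]} - 1))
LAUNCHER_EXIT_CODE=${CMD[$LAST]}      # last element is the exit code
CMD=("${CMD[@]:0:$LAST}")             # drop it before exec
[ "$LAUNCHER_EXIT_CODE" = 0 ] && exec "${CMD[@]}"   # prints: an arg with spaces --master yarn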

 


The org.apache.spark.launcher.Main class

This is the class that build_command invokes above: it parses the arguments handed to SparkSubmit and prints the final JVM command with the individual arguments separated by NUL characters; build_command then appends the launcher's exit code, and spark-class reads everything back into the CMD array, checks that exit code, and execs the remaining command.
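The launcher's output can also be inspected by hand with the same invocation that build_command uses, piping through tr to turn the NUL separators into newlines. This is only a sketch for exploration (it assumes SPARK_HOME and JAVA_HOME are set as in the environment above); spark-class itself feeds this output into the while/read loop instead:

"$JAVA_HOME"/bin/java -Xmx128m -cp "$SPARK_HOME/jars/*" \
  org.apache.spark.launcher.Main org.apache.spark.deploy.SparkSubmit \
  --class cn.com.dtmobile.spark.DebugTest \
  --master yarn --deploy-mode client \
  --num-executors 3 --executor-cores 2 --executor-memory 1G \
  /home/etluser/kong/debugTest/pucchSinr.jar | tr '\0' '\n'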

1. set -o posix

set is a shell builtin used to set attributes of the shell interpreter and thereby control aspects of its behavior.

For set, prefixing an option with - turns it on, and prefixing it with + turns it off.

POSIX (Portable Operating System Interface) is a design standard for UNIX systems, and many UNIX-like systems such as Linux aim to be compatible with it. Software written against the standard is portable across platforms, which is also why so much of this open-source software is available on Windows as well.

set -o posix turns on bash's POSIX mode. spark-class runs set +o posix to turn it off because, as the script's own comment notes, POSIX mode does not allow the process substitution used in done < <(build_command "$@").
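For example, the option can be toggled and inspected like this (the exact spacing of the set -o listing varies):

set -o posix          # turn POSIX mode on
set -o | grep posix   # shows: posix  on
set +o posix          # turn it off again, as spark-class does
set -o | grep posix   # shows: posix  off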

2. command -v java

command [-pVv] command [arg ...]

command bypasses the normal lookup of shell functions: only shell builtins and executables found on PATH are executed.

The -p option performs the search using a default value of PATH. -v prints the pathname (or builtin/alias definition) that would be used to run the command, and -V prints a more verbose description.

In spark-class, [ "$(command -v java)" ] is simply a test for whether java can be found on PATH when JAVA_HOME is not set.
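The same test can be tried on its own; the path printed depends on your machine:

if [ "$(command -v java)" ]; then
  echo "java resolves to: $(command -v java)"   # e.g. /usr/bin/java
else
  echo "no java found in PATH" >&2
fi
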
3. read -d

-d sets the delimiter. Normally read stops at a newline and splits the input on IFS; with -d it reads up to the first occurrence of the given character instead. For example, read -d m value stops at the first 'm': for the input "hello m" the variable value ends up holding "hello" (the space before the 'm' is stripped). This lets you define your own end-of-input marker, such as '.'. In spark-class, read -d '' uses the NUL character as the delimiter, which matches the NUL-separated output of the launcher.

Other common read options: -p (prompt string), -n (number of characters to read), -t (timeout in seconds), -s (do not echo the input).
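Two quick illustrations, one with an ordinary delimiter character and one with the NUL delimiter used by spark-class:

read -d m value <<< "hello mworld"
echo "$value"                          # prints: hello

printf 'a\0b\0' | while IFS= read -d '' -r x; do echo "got: $x"; done
# got: a
# got: b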

 
