Debugging why a Hadoop job fails with varying input

问题

There's a Hadoop job I'm trying to run, and when I specify the input as 28 repetitions of my toy data everything works perfectly, however, when I crank it to 29 the whole thing crashes.

My idea is that there isn't anything wrong with the logic of the code, as it works for 28 repetitions but not 29.

Here is what 2 repetitions of the input data looks like (repetitions are not to be confused with input files, it rather refers to the number of ones prepended to that long numeric string, i.e. 0101):

> evm --debug --code 7f00000000000000000000000000000000000000000000000000000000000000027f00000000000000000000000000000000000000000000000000000000000000027f00000000000000000000000000000000000000000000000000000000000000020101 run
opAdd      3903
opAdd      425
opStop       15
#### TRACE ####
PUSH32          pc=00000000 gas=9999999997 cost=3

PUSH32          pc=00000033 gas=9999999994 cost=3
Stack:
00000000  0000000000000000000000000000000000000000000000000000000000000002

PUSH32          pc=00000066 gas=9999999991 cost=3
Stack:
00000000  0000000000000000000000000000000000000000000000000000000000000002
00000001  0000000000000000000000000000000000000000000000000000000000000002

ADD             pc=00000099 gas=9999999988 cost=3
Stack:
00000000  0000000000000000000000000000000000000000000000000000000000000002
00000001  0000000000000000000000000000000000000000000000000000000000000002
00000002  0000000000000000000000000000000000000000000000000000000000000002

ADD             pc=00000100 gas=9999999985 cost=3
Stack:
00000000  0000000000000000000000000000000000000000000000000000000000000004
00000001  0000000000000000000000000000000000000000000000000000000000000002

STOP            pc=00000101 gas=9999999985 cost=0
Stack:
00000000  0000000000000000000000000000000000000000000000000000000000000006

#### LOGS ####
0x%

The code of the running job looks like this:

import java.io.*;
import java.util.ArrayList;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;

public class ExecutionTimeTracker {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable>{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private final Logger LOG = org.apache.log4j.Logger.getLogger(this.getClass());

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());

      FileSplit fileSplit = (FileSplit)context.getInputSplit();
      String filename = fileSplit.getPath().getName();

      //to write the file name as key
      Text text = new Text();
      text.set(filename);
      LOG.warn("fileName: " + filename);

      try {
          // command execution
          Runtime rt = Runtime.getRuntime();
          String evmDir = "/home/ubuntu/go/src/github.com/ethereum/go-ethereum/build/bin/evm";
          String command = evmDir + " --debug --code " + value.toString() + " run";
          Process proc = Runtime.getRuntime().exec(command);
          BufferedReader stdInput = new BufferedReader(new InputStreamReader(proc.getInputStream()));

          // output data struct
          ArrayList<String> consoleOutput = new ArrayList<String>();
          String s = null;
          while ((s = stdInput.readLine()) != null) {
              consoleOutput.add(s);
          }
          for (String p : consoleOutput) {
              Pattern pattern = Pattern.compile("([A-Za-z]+)([ \t]+)(\\d+)");
              Matcher matcher = pattern.matcher(p);
              while (matcher.find()) {
                  String groupThree = matcher.group(3);
                  IntWritable writeValue = new IntWritable(Integer.parseInt(groupThree));
                  context.write(text, writeValue);
              }
          }
          // close to prevent memory leak
          stdInput.close();
      } catch (IOException e) {
          LOG.warn("Exception Encountered!");
          LOG.warn(e);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "ExecutionTimeTracker");
    job.setJarByClass(ExecutionTimeTracker.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

The output for a successful job can be found below:

17/10/13 02:17:10 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/10/13 02:17:11 INFO client.RMProxy: Connecting to ResourceManager at master/172.31.46.70:8032
17/10/13 02:17:11 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/13 02:17:11 INFO input.FileInputFormat: Total input files to process : 1
17/10/13 02:17:12 INFO mapreduce.JobSubmitter: number of splits:1
17/10/13 02:17:12 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1507833515636_0006
17/10/13 02:17:12 INFO impl.YarnClientImpl: Submitted application application_1507833515636_0006
17/10/13 02:17:12 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1507833515636_0006/
17/10/13 02:17:12 INFO mapreduce.Job: Running job: job_1507833515636_0006
17/10/13 02:17:22 INFO mapreduce.Job: Job job_1507833515636_0006 running in uber mode : true
17/10/13 02:17:22 INFO mapreduce.Job:  map 100% reduce 0%
17/10/13 02:17:25 INFO mapreduce.Job:  map 100% reduce 100%
17/10/13 02:17:26 INFO mapreduce.Job: Job job_1507833515636_0006 completed successfully
17/10/13 02:17:26 INFO mapreduce.Job: Counters: 52
    File System Counters
        FILE: Number of bytes read=64
        FILE: Number of bytes written=112
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=4042
        HDFS: Number of bytes written=295682
        HDFS: Number of read operations=35
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=8
    Job Counters 
        Launched map tasks=1
        Launched reduce tasks=1
        Other local map tasks=1
        Total time spent by all maps in occupied slots (ms)=501
        Total time spent by all reduces in occupied slots (ms)=2415
        TOTAL_LAUNCHED_UBERTASKS=2
        NUM_UBER_SUBMAPS=1
        NUM_UBER_SUBREDUCES=1
        Total time spent by all map tasks (ms)=501
        Total time spent by all reduce tasks (ms)=2415
        Total vcore-milliseconds taken by all map tasks=501
        Total vcore-milliseconds taken by all reduce tasks=2415
        Total megabyte-milliseconds taken by all map tasks=513024
        Total megabyte-milliseconds taken by all reduce tasks=2472960
    Map-Reduce Framework
        Map input records=1
        Map output records=56
        Map output bytes=448
        Map output materialized bytes=16
        Input split bytes=96
        Combine input records=56
        Combine output records=1
        Reduce input groups=1
        Reduce shuffle bytes=16
        Reduce input records=1
        Reduce output records=1
        Spilled Records=2
        Shuffled Maps =1
        Failed Shuffles=0
        Merged Map outputs=1
        GC time elapsed (ms)=42
        CPU time spent (ms)=2060
        Physical memory (bytes) snapshot=971251712
        Virtual memory (bytes) snapshot=5902385152
        Total committed heap usage (bytes)=745537536
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters 
        Bytes Read=1903
    File Output Format Counters 
        Bytes Written=10

The full log of the slave node that performed this task can be found here.

Here is the output from an unsuccessful job:

17/10/12 20:42:41 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/10/12 20:42:42 INFO client.RMProxy: Connecting to ResourceManager at master/xxx.xxx.xxx.xxx:8032
17/10/12 20:42:42 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/12 20:42:42 INFO input.FileInputFormat: Total input files to process : 1
17/10/12 20:42:43 INFO mapreduce.JobSubmitter: number of splits:1
17/10/12 20:42:43 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1507833515636_0005
17/10/12 20:42:44 INFO impl.YarnClientImpl: Submitted application application_1507833515636_0005
17/10/12 20:42:44 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1507833515636_0005/
17/10/12 20:42:44 INFO mapreduce.Job: Running job: job_1507833515636_0005
17/10/12 20:42:49 INFO mapreduce.Job: Job job_1507833515636_0005 running in uber mode : true
17/10/12 20:42:49 INFO mapreduce.Job:  map 0% reduce 0%
17/10/12 20:43:01 INFO mapreduce.Job:  map 67% reduce 0%
17/10/12 20:53:19 INFO mapreduce.Job:  map 100% reduce 100%
17/10/12 20:53:19 INFO mapreduce.Job: Job job_1507833515636_0005 failed with state FAILED due to: Task failed task_1507833515636_0005_m_000000
Job failed as tasks failed. failedMaps:1 failedReduces:0

17/10/12 20:53:19 INFO mapreduce.Job: Counters: 18
    Job Counters 
        Failed map tasks=1
        Killed reduce tasks=1
        Launched map tasks=1
        Launched reduce tasks=1
        Other local map tasks=1
        Total time spent by all maps in occupied slots (ms)=629774
        Total time spent by all reduces in occupied slots (ms)=1
        TOTAL_LAUNCHED_UBERTASKS=1
        NUM_UBER_SUBMAPS=1
        Total time spent by all map tasks (ms)=629774
        Total time spent by all reduce tasks (ms)=1
        Total vcore-milliseconds taken by all map tasks=629774
        Total vcore-milliseconds taken by all reduce tasks=1
        Total megabyte-milliseconds taken by all map tasks=644888576
        Total megabyte-milliseconds taken by all reduce tasks=1024
    Map-Reduce Framework
        CPU time spent (ms)=0
        Physical memory (bytes) snapshot=0
        Virtual memory (bytes) snapshot=0

The full output error log, as recorded by the slave node that executed the task, can be found here.

As these jobs are running in uber mode, that should obviate many of the potential causes of this issue- however- as of yet, I've not been able to put my finger on the particular issue- open to all suggestions and insights! :)

Maybe it has something to do with the memory bounds of each individual container?

Here's what my configuration files look like:

mapred-site.xml:

<configuration>
  <property>
     <name>mapreduce.framework.name</name>
     <value>yarn</value>
  </property>
  <property>
     <name>mapreduce.job.ubertask.enable</name>
     <value>true</value>
  </property>
</configuration>

yarn-site.xml:

<configuration>
  <property>
     <name>yarn.nodemanager.aux-services</name>
     <value>mapreduce_shuffle</value>
  </property>
  <property>
     <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
     <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>master</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>40960</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>2048</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-pmem-ratio</name>
    <value>2.1</value>
  </property>
<property>
   <name>yarn.nodemanager.vmem-check-enabled</name>
   <value>false</value>
   <description>Whether virtual memory limits will be enforced for containers</description>
</property>
</configuration>

hdfs-site.xml:

<configuration>
  <property>
     <name>dfs.replication</name>
     <value>1</value>
  </property>
  <property>
     <name>dfs.namenode.name.dir</name>
     <value>file:/usr/local/hadoop_work/hdfs/namenode</value>
  </property>
  <property>
    <name>dfs.namenode.checkpoint.dir</name>
    <value>file:/usr/local/hadoop_work/hdfs/namesecondary</value>
  </property>
  <property>
     <name>dfs.datanode.data.dir</name>
     <value>file:/usr/local/hadoop_work/hdfs/datanode</value>
  </property>
  <property>
    <name>dfs.secondary.http.address</name>
    <value>xxx.xxx.xxx.xxx:50090</value>
  </property>
<property> 
<name>dfs.block.size</name> 
<value>134217728</value> 
<description>Block size</description> 
</property>
</configuration>

来源：https://stackoverflow.com/questions/46721969/debugging-why-a-hadoop-job-fails-with-varying-input

标签

Hadoop

MapReduce