Question
There's a Hadoop job I'm trying to run. When I specify the input as 28 repetitions of my toy data, everything works perfectly; however, when I crank it up to 29, the whole thing crashes.
My feeling is that there isn't anything wrong with the logic of the code, since it works for 28 repetitions but not for 29.
Here is what 2 repetitions of the input data look like (repetitions are not to be confused with input files; it rather refers to the number of 01 opcodes appended to that long hex string, i.e. the trailing 0101):
> evm --debug --code 7f00000000000000000000000000000000000000000000000000000000000000027f00000000000000000000000000000000000000000000000000000000000000027f00000000000000000000000000000000000000000000000000000000000000020101 run
opAdd 3903
opAdd 425
opStop 15
#### TRACE ####
PUSH32 pc=00000000 gas=9999999997 cost=3
PUSH32 pc=00000033 gas=9999999994 cost=3
Stack:
00000000 0000000000000000000000000000000000000000000000000000000000000002
PUSH32 pc=00000066 gas=9999999991 cost=3
Stack:
00000000 0000000000000000000000000000000000000000000000000000000000000002
00000001 0000000000000000000000000000000000000000000000000000000000000002
ADD pc=00000099 gas=9999999988 cost=3
Stack:
00000000 0000000000000000000000000000000000000000000000000000000000000002
00000001 0000000000000000000000000000000000000000000000000000000000000002
00000002 0000000000000000000000000000000000000000000000000000000000000002
ADD pc=00000100 gas=9999999985 cost=3
Stack:
00000000 0000000000000000000000000000000000000000000000000000000000000004
00000001 0000000000000000000000000000000000000000000000000000000000000002
STOP pc=00000101 gas=9999999985 cost=0
Stack:
00000000 0000000000000000000000000000000000000000000000000000000000000006
#### LOGS ####
0x%
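(For context, this is how an input line for n repetitions is put together, as I read the example above: n + 1 PUSH32 words holding the value 2, followed by n ADD opcodes. The helper below is purely illustrative and not part of the job; the class name is made up.)
public class ToyInputBuilder {
    // Builds one input line like the hex string above, assuming the layout
    // from the 2-repetition example: (n + 1) PUSH32 operands (0x7f followed
    // by a 32-byte word with value 2) and n ADD opcodes (0x01).
    public static String build(int repetitions) {
        StringBuilder sb = new StringBuilder();
        String word = String.format("%064x", 2);   // 32-byte operand, value 2
        for (int i = 0; i <= repetitions; i++) {
            sb.append("7f").append(word);          // PUSH32 <word>
        }
        for (int i = 0; i < repetitions; i++) {
            sb.append("01");                       // ADD
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        System.out.println(build(2));              // reproduces the string used above
    }
}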
The code of the running job looks like this:
import java.io.*;
import java.util.ArrayList;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.log4j.Logger;

public class ExecutionTimeTracker {

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        private final Logger LOG = org.apache.log4j.Logger.getLogger(this.getClass());

        public void map(Object key, Text value, Context context
                        ) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());

            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            String filename = fileSplit.getPath().getName();

            // to write the file name as key
            Text text = new Text();
            text.set(filename);
            LOG.warn("fileName: " + filename);

            try {
                // command execution
                Runtime rt = Runtime.getRuntime();
                String evmDir = "/home/ubuntu/go/src/github.com/ethereum/go-ethereum/build/bin/evm";
                String command = evmDir + " --debug --code " + value.toString() + " run";
                Process proc = Runtime.getRuntime().exec(command);

                BufferedReader stdInput = new BufferedReader(new InputStreamReader(proc.getInputStream()));

                // output data struct
                ArrayList<String> consoleOutput = new ArrayList<String>();
                String s = null;
                while ((s = stdInput.readLine()) != null) {
                    consoleOutput.add(s);
                }

                for (String p : consoleOutput) {
                    Pattern pattern = Pattern.compile("([A-Za-z]+)([ \t]+)(\\d+)");
                    Matcher matcher = pattern.matcher(p);
                    while (matcher.find()) {
                        String groupThree = matcher.group(3);
                        IntWritable writeValue = new IntWritable(Integer.parseInt(groupThree));
                        context.write(text, writeValue);
                    }
                }

                // close to prevent memory leak
                stdInput.close();
            } catch (IOException e) {
                LOG.warn("Exception Encountered!");
                LOG.warn(e);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context
                           ) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "ExecutionTimeTracker");
        job.setJarByClass(ExecutionTimeTracker.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
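For what it's worth, the regex in the mapper is meant to pick up the per-opcode timing lines that evm prints (opAdd 3903 and friends) and emit the number as the map output value. Here is a standalone sketch of just that parsing step, outside of Hadoop, using sample lines copied from the output above (the class name is made up):
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TimingRegexDemo {
    public static void main(String[] args) {
        // Same pattern as in the mapper: opcode name, whitespace, a number.
        Pattern pattern = Pattern.compile("([A-Za-z]+)([ \t]+)(\\d+)");

        // Sample lines taken from the evm output shown earlier.
        String[] lines = {
            "opAdd 3903",
            "opAdd 425",
            "opStop 15",
            "PUSH32 pc=00000000 gas=9999999997 cost=3"   // trace line: no match
        };

        for (String line : lines) {
            Matcher matcher = pattern.matcher(line);
            while (matcher.find()) {
                // group(1) is the opcode name, group(3) the timing value
                System.out.println(matcher.group(1) + " -> " + matcher.group(3));
            }
        }
    }
}
Running this prints opAdd -> 3903, opAdd -> 425, opStop -> 15; the trace, Stack:, and hex lines in the output shown above produce no matches, so only the timing numbers become map output values.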
The output for a successful job can be found below:
17/10/13 02:17:10 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/10/13 02:17:11 INFO client.RMProxy: Connecting to ResourceManager at master/172.31.46.70:8032
17/10/13 02:17:11 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/13 02:17:11 INFO input.FileInputFormat: Total input files to process : 1
17/10/13 02:17:12 INFO mapreduce.JobSubmitter: number of splits:1
17/10/13 02:17:12 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1507833515636_0006
17/10/13 02:17:12 INFO impl.YarnClientImpl: Submitted application application_1507833515636_0006
17/10/13 02:17:12 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1507833515636_0006/
17/10/13 02:17:12 INFO mapreduce.Job: Running job: job_1507833515636_0006
17/10/13 02:17:22 INFO mapreduce.Job: Job job_1507833515636_0006 running in uber mode : true
17/10/13 02:17:22 INFO mapreduce.Job: map 100% reduce 0%
17/10/13 02:17:25 INFO mapreduce.Job: map 100% reduce 100%
17/10/13 02:17:26 INFO mapreduce.Job: Job job_1507833515636_0006 completed successfully
17/10/13 02:17:26 INFO mapreduce.Job: Counters: 52
File System Counters
FILE: Number of bytes read=64
FILE: Number of bytes written=112
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=4042
HDFS: Number of bytes written=295682
HDFS: Number of read operations=35
HDFS: Number of large read operations=0
HDFS: Number of write operations=8
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Other local map tasks=1
Total time spent by all maps in occupied slots (ms)=501
Total time spent by all reduces in occupied slots (ms)=2415
TOTAL_LAUNCHED_UBERTASKS=2
NUM_UBER_SUBMAPS=1
NUM_UBER_SUBREDUCES=1
Total time spent by all map tasks (ms)=501
Total time spent by all reduce tasks (ms)=2415
Total vcore-milliseconds taken by all map tasks=501
Total vcore-milliseconds taken by all reduce tasks=2415
Total megabyte-milliseconds taken by all map tasks=513024
Total megabyte-milliseconds taken by all reduce tasks=2472960
Map-Reduce Framework
Map input records=1
Map output records=56
Map output bytes=448
Map output materialized bytes=16
Input split bytes=96
Combine input records=56
Combine output records=1
Reduce input groups=1
Reduce shuffle bytes=16
Reduce input records=1
Reduce output records=1
Spilled Records=2
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=42
CPU time spent (ms)=2060
Physical memory (bytes) snapshot=971251712
Virtual memory (bytes) snapshot=5902385152
Total committed heap usage (bytes)=745537536
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=1903
File Output Format Counters
Bytes Written=10
The full log of the slave node that performed this task can be found here.
Here is the output from an unsuccessful job:
17/10/12 20:42:41 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
17/10/12 20:42:42 INFO client.RMProxy: Connecting to ResourceManager at master/xxx.xxx.xxx.xxx:8032
17/10/12 20:42:42 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
17/10/12 20:42:42 INFO input.FileInputFormat: Total input files to process : 1
17/10/12 20:42:43 INFO mapreduce.JobSubmitter: number of splits:1
17/10/12 20:42:43 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1507833515636_0005
17/10/12 20:42:44 INFO impl.YarnClientImpl: Submitted application application_1507833515636_0005
17/10/12 20:42:44 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1507833515636_0005/
17/10/12 20:42:44 INFO mapreduce.Job: Running job: job_1507833515636_0005
17/10/12 20:42:49 INFO mapreduce.Job: Job job_1507833515636_0005 running in uber mode : true
17/10/12 20:42:49 INFO mapreduce.Job: map 0% reduce 0%
17/10/12 20:43:01 INFO mapreduce.Job: map 67% reduce 0%
17/10/12 20:53:19 INFO mapreduce.Job: map 100% reduce 100%
17/10/12 20:53:19 INFO mapreduce.Job: Job job_1507833515636_0005 failed with state FAILED due to: Task failed task_1507833515636_0005_m_000000
Job failed as tasks failed. failedMaps:1 failedReduces:0
17/10/12 20:53:19 INFO mapreduce.Job: Counters: 18
Job Counters
Failed map tasks=1
Killed reduce tasks=1
Launched map tasks=1
Launched reduce tasks=1
Other local map tasks=1
Total time spent by all maps in occupied slots (ms)=629774
Total time spent by all reduces in occupied slots (ms)=1
TOTAL_LAUNCHED_UBERTASKS=1
NUM_UBER_SUBMAPS=1
Total time spent by all map tasks (ms)=629774
Total time spent by all reduce tasks (ms)=1
Total vcore-milliseconds taken by all map tasks=629774
Total vcore-milliseconds taken by all reduce tasks=1
Total megabyte-milliseconds taken by all map tasks=644888576
Total megabyte-milliseconds taken by all reduce tasks=1024
Map-Reduce Framework
CPU time spent (ms)=0
Physical memory (bytes) snapshot=0
Virtual memory (bytes) snapshot=0
The full output error log, as recorded by the slave node that executed the task, can be found here.
Since these jobs run in uber mode, that should rule out many of the potential causes of this issue; even so, I haven't yet been able to put my finger on the particular problem. I'm open to all suggestions and insights! :)
Maybe it has something to do with the memory bounds of each individual container?
Here's what my configuration files look like:
mapred-site.xml:
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.job.ubertask.enable</name>
<value>true</value>
</property>
</configuration>
yarn-site.xml:
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>40960</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>2.1</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
<description>Whether virtual memory limits will be enforced for containers</description>
</property>
</configuration>
hdfs-site.xml:
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/usr/local/hadoop_work/hdfs/namenode</value>
</property>
<property>
<name>dfs.namenode.checkpoint.dir</name>
<value>file:/usr/local/hadoop_work/hdfs/namesecondary</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/usr/local/hadoop_work/hdfs/datanode</value>
</property>
<property>
<name>dfs.secondary.http.address</name>
<value>xxx.xxx.xxx.xxx:50090</value>
</property>
<property>
<name>dfs.block.size</name>
<value>134217728</value>
<description>Block size</description>
</property>
</configuration>
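Coming back to the container-memory hunch above: I haven't set any of the per-task memory properties, so the MapReduce defaults apply. If the memory bounds do turn out to be the culprit, the first thing I'd try is setting them explicitly in mapred-site.xml. The snippet below is only a sketch of what that would look like; the values are placeholders and not something I'm currently running:
<!-- not part of my current mapred-site.xml; placeholder values -->
<property>
    <name>mapreduce.map.memory.mb</name>
    <value>4096</value>
</property>
<property>
    <name>mapreduce.map.java.opts</name>
    <value>-Xmx3276m</value>
</property>
<property>
    <name>mapreduce.reduce.memory.mb</name>
    <value>4096</value>
</property>
<property>
    <name>mapreduce.reduce.java.opts</name>
    <value>-Xmx3276m</value>
</property>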
Source: https://stackoverflow.com/questions/46721969/debugging-why-a-hadoop-job-fails-with-varying-input