How to execute spark-submit on Amazon EMR from a Lambda function?

Asked by 忘了有多久 on 2020-12-13 01:11

I want to execute a spark-submit job on an AWS EMR cluster based on a file-upload event on S3. I am using an AWS Lambda function to capture the event, but I have no idea how to submit the spark job to the EMR cluster from the Lambda function.

2 Answers
  • 2020-12-13 01:28

    You can; I had to do the same thing last week!

    Using boto3 for Python (the other AWS SDKs have similar calls), you can either start a new cluster with the step defined up front, or attach a step to a cluster that is already running.

    Defining the cluster with the step

    import boto3

    def lambda_handler(event, context):
        conn = boto3.client("emr")
        cluster_id = conn.run_job_flow(
            Name='ClusterName',
            ServiceRole='EMR_DefaultRole',
            JobFlowRole='EMR_EC2_DefaultRole',
            VisibleToAllUsers=True,
            LogUri='s3n://some-log-uri/elasticmapreduce/',
            ReleaseLabel='emr-5.8.0',
            Instances={
                'InstanceGroups': [
                    {
                        'Name': 'Master nodes',
                        'Market': 'ON_DEMAND',
                        'InstanceRole': 'MASTER',
                        'InstanceType': 'm3.xlarge',
                        'InstanceCount': 1,
                    },
                    {
                        'Name': 'Slave nodes',
                        'Market': 'ON_DEMAND',
                        'InstanceRole': 'CORE',
                        'InstanceType': 'm3.xlarge',
                        'InstanceCount': 2,
                    }
                ],
                'Ec2KeyName': 'key-name',
                'KeepJobFlowAliveWhenNoSteps': False,
                'TerminationProtected': False
            },
            Applications=[{
                'Name': 'Spark'
            }],
            Configurations=[{
                "Classification":"spark-env",
                "Properties":{},
                "Configurations":[{
                    "Classification":"export",
                    "Properties":{
                        "PYSPARK_PYTHON":"python35",
                        "PYSPARK_DRIVER_PYTHON":"python35"
                    }
                }]
            }],
            BootstrapActions=[{
                'Name': 'Install',
                'ScriptBootstrapAction': {
                    'Path': 's3://path/to/bootstrap.script'
                }
            }],
            Steps=[{
                'Name': 'StepName',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                    'Args': [
                        "/usr/bin/spark-submit", "--deploy-mode", "cluster",
                        's3://path/to/code.file', '-i', 'input_arg', 
                        '-o', 'output_arg'
                    ]
                }
            }],
        )
        return "Started cluster {}".format(cluster_id)
    

    Attaching a step to an already running cluster

    Based on an existing example:

    import sys
    import time

    import boto3

    def lambda_handler(event, context):
        conn = boto3.client("emr")
        # chooses the first cluster which is Running or Waiting
        # possibly can also choose by name or already have the cluster id
        clusters = conn.list_clusters()
        # choose the correct cluster
        clusters = [c["Id"] for c in clusters["Clusters"] 
                    if c["Status"]["State"] in ["RUNNING", "WAITING"]]
        if not clusters:
            sys.stderr.write("No valid clusters\n")
            sys.exit(1)
        # take the first relevant cluster
        cluster_id = clusters[0]
        # code location on your emr master node
        CODE_DIR = "/home/hadoop/code/"
    
        # spark-submit configuration example ('--conf' takes key=value pairs)
        step_args = ["/usr/bin/spark-submit", "--conf", "your-configuration",
                     CODE_DIR + "your_file.py", '--your-parameters', 'parameters']
    
        step = {"Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
                'ActionOnFailure': 'CONTINUE',
                'HadoopJarStep': {
                    'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                    'Args': step_args
                }
            }
        action = conn.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
        return "Added step: %s"%(action)
    
  • 2020-12-13 01:40

    AWS Lambda function Python code if you want to execute a Spark jar with spark-submit. This version posts the job to the Apache Livy endpoint on the EMR master node (port 8998), which runs spark-submit on the cluster for you:

    import json

    # note: botocore.vendored.requests was removed from newer botocore releases;
    # bundle the 'requests' package with the Lambda if this import fails
    from botocore.vendored import requests

    def lambda_handler(event, context):
        headers = {"content-type": "application/json"}
        # Livy batches endpoint on the EMR master node
        url = 'http://ip-address.ec2.internal:8998/batches'
        payload = {
            # 'file' is the main application jar; the JDBC driver jars are
            # dependencies, which Livy takes through the separate 'jars' field
            'file': 's3://Bucket/Orchestration/SparkCode.jar',
            'jars': ['s3://Bucket/Orchestration/RedshiftJDBC41.jar',
                     's3://Bucket/Orchestration/mysql-connector-java-8.0.12.jar'],
            'className': 'Main Class Name',
            'args': [event.get('rootPath')]
        }
        res = requests.post(url, data=json.dumps(payload), headers=headers, verify=False)
        json_data = json.loads(res.text)
        # Livy responds with the id of the submitted batch
        return json_data.get('id')
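
    You can then poll Livy for the batch state through the same endpoint. A minimal sketch, reusing url and headers from above and the id returned by the POST:

    # GET /batches/{id} reports the state: starting, running, success, dead
    batch_id = json_data.get('id')
    res = requests.get('{}/{}'.format(url, batch_id), headers=headers)
    print(json.loads(res.text).get('state'))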
    