Problem while implementing a join of two datasets in Google Cloud Dataflow using Apache Beam

Submitted by 老子叫甜甜 on 2019-12-08 05:09:35

Question


I was trying to run SQL on two datasets in Google Cloud Storage using Apache Beam, following the Apache Beam SQL walkthrough at https://beam.apache.org/documentation/dsls/sql/walkthrough/

But I end up with the exception below: An exception occurred while executing the Java class. org.apache.beam.sdk.transforms.MapElements.via(Lorg/apache/beam/sdk/transforms/SimpleFunction;)Lorg/apache/beam/sdk/transforms/MapElements;

I tried changing the Beam SDK version and made other code changes, but none of them worked.
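For context, a message of the form org.apache.beam.sdk.transforms.MapElements.via(Lorg/apache/beam/sdk/transforms/SimpleFunction;)Lorg/apache/beam/sdk/transforms/MapElements; is how a java.lang.NoSuchMethodError prints the missing method descriptor, and in Beam projects that usually means two Beam modules on the classpath were built against different SDK versions. Below is a minimal sketch of a consistent Maven dependency set; the beam.version property and the 2.12.0 value are illustrative assumptions, not taken from the question:

<properties>
    <beam.version>2.12.0</beam.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.beam</groupId>
        <artifactId>beam-sdks-java-core</artifactId>
        <version>${beam.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.beam</groupId>
        <artifactId>beam-sdks-java-extensions-sql</artifactId>
        <version>${beam.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.beam</groupId>
        <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
        <version>${beam.version}</version>
    </dependency>
</dependencies>

Pinning every org.apache.beam artifact to a single version keeps binaries compiled against different MapElements signatures from mixing on the classpath.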

package com.nitesh.gcp.feature;

import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;

import java.util.stream.Collectors;


public class beamSQL1 {

    public static final String EMPHEADER = "empno,ename,job,mgr,hiredate,sal,comm,deptno";
    public static final String DEPTHEADER = "deptno,dname,location";
    public static final Schema EMPSCHEMA = Schema.builder()
            .addStringField("empno")
            .addStringField("ename")
            .addStringField("job")
            .addStringField("mgr")
            .addStringField("hiredate")
            .addStringField("sal")
            .addStringField("comm")
            .addStringField("deptno")
            .build();
    public static final Schema DEPTSCHEMA = Schema.builder()
            .addStringField("deptno")
            .addStringField("dname")
            .addStringField("location")
            .build();


    public static void main(String[] args) {
        PipelineOptionsFactory.register(DataflowPipelineOptions.class);
        DataflowPipelineOptions options = PipelineOptionsFactory
                .fromArgs(args)
                .withValidation()
                .as(DataflowPipelineOptions.class);

        Pipeline pipeline = Pipeline.create(options);
        // Give each step a unique name; Beam warns when two transforms share the same name.
        PCollection<String> employee = pipeline.apply("Read Employees From GCS", TextIO.read().from("gs://amazon-test/sqlData/employee.txt"));
        PCollection<String> department = pipeline.apply("Read Departments From GCS", TextIO.read().from("gs://amazon-test/sqlData/department.txt"));
        PCollection<Row> employeeRow = employee.apply("Transform Employees To Row", ParDo.of(new RowParDo())).setRowSchema(EMPSCHEMA);
        PCollection<Row> departmentRow = department.apply("Transform Departments To Row", ParDo.of(new RowParDoForDept())).setRowSchema(DEPTSCHEMA);
        PCollectionTuple output = PCollectionTuple.of(new TupleTag<>("emp"), employeeRow).and(new TupleTag<>("dept"), departmentRow);


        output.apply(
                SqlTransform.query(
                        // "SELECT emp.empno,emp.ename,dept.deptno,dept.dname FROM emp JOIN dept ON emp.deptno = dept.deptno"))
                        "SELECT * from emp JOIN dept ON emp.deptno = dept.deptno"))

                /* p2.apply("Transform Sql", SqlTransform.query(
                         "SELECT * " +
                                 "FROM PCOLLECTION order by sal desc LIMIT 14")
                 )*/
                .apply("TransForm To String", ParDo.of(new RowToString()))
                .apply("Write To GCS", TextIO.write().to("gs://amazon-test/sqlData/output/outputSql.csv").withoutSharding());

        pipeline.run();
    }

    // ParDo for employee lines: String -> Row (SQL)
    public static class RowParDo extends DoFn<String, Row> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            if (!c.element().equalsIgnoreCase(EMPHEADER)) {
                String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
                Row appRow = Row
                        .withSchema(EMPSCHEMA)
                        .addValues(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7])
                        .build();
                c.output(appRow);
            }
        }
    }

    //ParDo for Row (SQL) -> String
    public static class RowToString extends DoFn<Row, String> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            String line = c.element().getValues()
                    .stream()
                    .map(Object::toString)
                    .collect(Collectors.joining(","));
            c.output(line);
        }
    }

    // ParDo for department lines: String -> Row (SQL)
    public static class RowParDoForDept extends DoFn<String, Row> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            if (!c.element().equalsIgnoreCase(DEPTHEADER)) {
                String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
                Row appRow = Row
                        .withSchema(DEPTSCHEMA)
                        .addValues(vals[0], vals[1], vals[2])
                        .build();
                c.output(appRow);
            }
        }
    }


}
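As a sanity check, the same join can be exercised without GCS or Dataflow by feeding in-memory rows to SqlTransform, in the style of the linked walkthrough. The sketch below is a hypothetical, self-contained example (it assumes beam-runners-direct-java is on the classpath; the class name, trimmed schemas, and row values are made up for illustration):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;

public class BeamSqlJoinSmokeTest {

    public static void main(String[] args) {
        // Trimmed-down schemas: just enough columns to express the join.
        Schema empSchema = Schema.builder()
                .addStringField("ename")
                .addStringField("deptno")
                .build();
        Schema deptSchema = Schema.builder()
                .addStringField("deptno")
                .addStringField("dname")
                .build();

        // No options: falls back to the direct runner when it is on the classpath.
        Pipeline p = Pipeline.create();

        // In-memory rows instead of TextIO reads from GCS.
        PCollection<Row> emp = p.apply("Create emp rows", Create.of(
                Row.withSchema(empSchema).addValues("SMITH", "10").build(),
                Row.withSchema(empSchema).addValues("JONES", "20").build())
                .withRowSchema(empSchema));
        PCollection<Row> dept = p.apply("Create dept rows", Create.of(
                Row.withSchema(deptSchema).addValues("10", "ACCOUNTING").build(),
                Row.withSchema(deptSchema).addValues("20", "RESEARCH").build())
                .withRowSchema(deptSchema));

        // Same tagged-tuple pattern and join query as in the question.
        PCollection<Row> joined = PCollectionTuple
                .of(new TupleTag<>("emp"), emp)
                .and(new TupleTag<>("dept"), dept)
                .apply(SqlTransform.query(
                        "SELECT emp.ename, dept.dname FROM emp JOIN dept ON emp.deptno = dept.deptno"));

        // Print each joined row locally instead of writing to GCS.
        joined.apply("Print rows", ParDo.of(new DoFn<Row, Void>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
                System.out.println(c.element());
            }
        }));

        p.run().waitUntilFinish();
    }
}

If this local version runs cleanly, the query and schemas are sound, which would point to a classpath/version problem rather than a pipeline bug as the cause of the original failure.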

Source: https://stackoverflow.com/questions/55942197/problem-while-implementing-join-of-two-dataset-in-google-cloud-dataflow-using-ap
