Question
I was trying to run SQL over two datasets on Google Cloud Storage with Apache Beam, following the Apache Beam documentation: https://beam.apache.org/documentation/dsls/sql/walkthrough/
But I ended up with the exception below:
An exception occured while executing the Java class. org.apache.beam.sdk.transforms.MapElements.via(Lorg/apache/beam/sdk/transforms/SimpleFunction;)Lorg/apache/beam/sdk/transforms/MapElements;
I tried changing the Beam SDK version and made other code changes, but none of them worked.
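
The truncated message has the shape of a java.lang.NoSuchMethodError on MapElements.via(SimpleFunction) (the "An exception occured while executing the Java class" wrapper is what the Maven exec plugin prints). In Beam projects that error usually means two SDK modules of different versions ended up on the classpath, for example beam-sdks-java-extensions-sql compiled against a different beam-sdks-java-core than the one actually loaded. A minimal sketch, assuming Maven, of pinning every Beam artifact to a single version (the version number itself is illustrative):

<properties>
    <beam.version>2.12.0</beam.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.beam</groupId>
        <artifactId>beam-sdks-java-core</artifactId>
        <version>${beam.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.beam</groupId>
        <artifactId>beam-sdks-java-extensions-sql</artifactId>
        <version>${beam.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.beam</groupId>
        <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
        <version>${beam.version}</version>
    </dependency>
</dependencies>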
package com.nitesh.gcp.feature;
import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;
import java.util.stream.Collectors;
public class beamSQL1 {
    public static final String EMPHEADER = "empno,ename,job,mgr,hiredate,sal,comm,deptno";
    public static final String DEPTHEADER = "deptno,dname,location";

    public static final Schema EMPSCHEMA = Schema.builder()
            .addStringField("empno")
            .addStringField("ename")
            .addStringField("job")
            .addStringField("mgr")
            .addStringField("hiredate")
            .addStringField("sal")
            .addStringField("comm")
            .addStringField("deptno")
            .build();

    public static final Schema DEPTSCHEMA = Schema.builder()
            .addStringField("deptno")
            .addStringField("dname")
            .addStringField("location")
            .build();
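
    // Note: every column is declared as a string, so SQL comparisons and ordering on
    // fields like sal are lexicographic; numeric semantics would need numeric field
    // types in the schema or a CAST in the query.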
    public static void main(String[] args) {
        PipelineOptionsFactory.register(DataflowPipelineOptions.class);
        DataflowPipelineOptions options = PipelineOptionsFactory
                .fromArgs(args)
                .withValidation()
                .as(DataflowPipelineOptions.class);
        Pipeline pipeline = Pipeline.create(options);

        // Transform names should be unique within a pipeline, so each step gets its own label.
        PCollection<String> employee = pipeline.apply("Read Employees From GCS", TextIO.read().from("gs://amazon-test/sqlData/employee.txt"));
        PCollection<String> department = pipeline.apply("Read Departments From GCS", TextIO.read().from("gs://amazon-test/sqlData/department.txt"));
        PCollection<Row> employeeRow = employee.apply("Employees To Rows", ParDo.of(new RowParDo())).setRowSchema(EMPSCHEMA);
        PCollection<Row> departmentRow = department.apply("Departments To Rows", ParDo.of(new RowParDoForDept())).setRowSchema(DEPTSCHEMA);
        PCollectionTuple output = PCollectionTuple.of(new TupleTag<>("emp"), employeeRow).and(new TupleTag<>("dept"), departmentRow);
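
        // The tuple tags "emp" and "dept" above become the table names in the SQL query below.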
        output.apply(
                SqlTransform.query(
                        // "SELECT emp.empno,emp.ename,dept.deptno,dept.dname FROM emp JOIN dept ON emp.deptno = dept.deptno"))
                        "SELECT * FROM emp JOIN dept ON emp.deptno = dept.deptno"))
                /* p2.apply("Transform Sql", SqlTransform.query(
                        "SELECT * " +
                                "FROM PCOLLECTION order by sal desc LIMIT 14")
                ) */
                .apply("Transform To String", ParDo.of(new RowToString()))
                .apply("Write To GCS", TextIO.write().to("gs://amazon-test/sqlData/output/outputSql.csv").withoutSharding());
        pipeline.run();
    }
    // ParDo for String -> Row (SQL)
    public static class RowParDo extends DoFn<String, Row> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            if (!c.element().equalsIgnoreCase(EMPHEADER)) {
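                // Split on commas that fall outside double quotes, so quoted fields
                // containing commas stay intact.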
                String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
                Row appRow = Row
                        .withSchema(EMPSCHEMA)
                        .addValues(vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7])
                        .build();
                c.output(appRow);
            }
        }
    }
    // ParDo for Row (SQL) -> String
    public static class RowToString extends DoFn<Row, String> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            String line = c.element().getValues()
                    .stream()
                    .map(Object::toString)
                    .collect(Collectors.joining(","));
            c.output(line);
        }
    }
    // ParDo for String -> Row (SQL)
    public static class RowParDoForDept extends DoFn<String, Row> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            if (!c.element().equalsIgnoreCase(DEPTHEADER)) {
                String[] vals = c.element().split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)");
                Row appRow = Row
                        .withSchema(DEPTSCHEMA)
                        .addValues(vals[0], vals[1], vals[2])
                        .build();
                c.output(appRow);
            }
        }
    }
}
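
To confirm which Beam jar actually provides MapElements at runtime, a small reflection check can be dropped into the same project (BeamClasspathCheck is a hypothetical class name; the sketch assumes the version-skew diagnosis above):

package com.nitesh.gcp.feature;

import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.SimpleFunction;

public class BeamClasspathCheck {
    public static void main(String[] args) throws Exception {
        // Print the jar MapElements was loaded from; a different Beam version than the
        // SQL extension expects would explain the NoSuchMethodError.
        System.out.println(MapElements.class.getProtectionDomain().getCodeSource().getLocation());
        // Throws NoSuchMethodException here if this beam-sdks-java-core lacks the
        // via(SimpleFunction) overload named in the error message.
        System.out.println(MapElements.class.getMethod("via", SimpleFunction.class));
    }
}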
Source: https://stackoverflow.com/questions/55942197/problem-while-implementing-join-of-two-dataset-in-google-cloud-dataflow-using-ap