spark reading data from mysql in parallel

忘了有多久  2020-12-13 22:40

I'm trying to read data from MySQL and write it back to a Parquet file in S3 with specific partitions, as follows:

df = sqlContext.read.format('jdbc')\
   .opti


        
2 Answers
  •  轮回少年
    2020-12-13 23:04

    For Spark >= 2.0 I've created a class with the following methods:

    ...
    private val dbUrl =
    s"""jdbc:mysql://${host}:${port}/${db_name}
        |?zeroDateTimeBehavior=convertToNull
        |&read_buffer_size=100M""".stripMargin.replace("\n", "")
    
    def run(sqlQuery: String): DataFrame = {
      println(sqlQuery)
      Datapipeline.spark.read
        .format("jdbc")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("url", dbUrl)
        .option("user", user)
        .option("password", pass)
        .option("dbtable", s"($sqlQuery) as tmp")
        .load()
    }
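    // Note: a plain JDBC read like this comes back as a single partition, which is
    // why the second run(...) overload below splits the table into ranged sub-queries
    // and unions them to get parallel reads.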
    ...
    def getBounds(table: String, whereClause: String, partitionColumn: String): Array[Int] = {
      val sql = s"select min($partitionColumn) as min, max($partitionColumn) as max from $table${
        if (whereClause.length > 0) s" where $whereClause" else ""
      }"
      val df = run(sql).collect()(0)

      Array(df.get(0).asInstanceOf[Int], df.get(1).asInstanceOf[Int])
    }
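    // Example with hypothetical names: getBounds("transactions", "status = 'PAID'", "id")
    // issues "select min(id) as min, max(id) as max from transactions where status = 'PAID'"
    // and returns Array(minId, maxId) to use as the partitioning bounds.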
    
    def getTableFields(table: String): String = {
      val sql =
        s"""
           |SELECT *
           |FROM information_schema.COLUMNS
           |WHERE table_name LIKE '$table'
           |  AND TABLE_SCHEMA LIKE '${db_name}'
           |ORDER BY ORDINAL_POSITION
         """.stripMargin
      run(sql).collect().map(r => r.getAs[String]("COLUMN_NAME")).mkString(", ")
    }
    
    /**
    * Returns a DataFrame partitioned by `partitionColumn` into the number of partitions
    * provided in `numPartitions` for a `table`, with an optional WHERE clause
    * @param table - a table name
    * @param whereClause - WHERE clause without the "WHERE" keyword
    * @param partitionColumn - column name used for partitioning, should be numeric
    * @param numPartitions - number of partitions
    * @return - a DataFrame
    */
    def run(table: String, whereClause: String, partitionColumn: String, numPartitions: Int): DataFrame = {
      val bounds = getBounds(table, whereClause, partitionColumn)

      val fields = getTableFields(table)
      val dfs: Array[DataFrame] = new Array[DataFrame](numPartitions)

      val lowerBound = bounds(0)
      val partitionRange: Int = (bounds(1) - bounds(0)) / numPartitions
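      // Worked example with hypothetical bounds: bounds = Array(1, 1000001), numPartitions = 4
      // => partitionRange = 250000, i.e. ranges [1, 250001), [250001, 500001), [500001, 750001)
      //    plus a final open-ended chunk (id >= 750001) that also covers the division remainder.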
    
      for (i <- 0 to numPartitions - 2) {
        dfs(i) = run(
          s"""select $fields from $table
              | where $partitionColumn >= ${lowerBound + (partitionRange * i)} and $partitionColumn < ${lowerBound + (partitionRange * (i + 1))}${
            if (whereClause.length > 0) s" and $whereClause" else ""
          }
           """.stripMargin.replace("\n", ""))
      }
    
      // The last chunk is open-ended so the maximum value (and any integer-division remainder) is included
      dfs(numPartitions - 1) = run(s"select $fields from $table where $partitionColumn >= ${lowerBound + (partitionRange * (numPartitions - 1))}${
        if (whereClause.length > 0) s" and $whereClause" else ""
      }".replace("\n", ""))
    
      dfs.reduceLeft((res, df) => res.union(df))
    }
    

    The last run method builds the required number of partitions. When you call an action, Spark creates as many parallel tasks as there are partitions in the DataFrame returned by that run method.
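    As a rough end-to-end sketch of how this might be used for the original question (the table name, column names and S3 path below are placeholders, and the partitioned Parquet write is an assumption based on the question):

    // Hypothetical usage: split the read into 8 ranged queries, then write to S3
    val df = run(
      table = "transactions",
      whereClause = "created_at >= '2020-01-01'",
      partitionColumn = "id",
      numPartitions = 8)

    df.write
      .mode("overwrite")
      .partitionBy("country")                  // output partition column, placeholder
      .parquet("s3a://my-bucket/transactions/")

    Note that newer Spark versions can do a similar numeric split natively through the JDBC reader options partitionColumn, lowerBound, upperBound and numPartitions; the manual approach above is mainly useful when each chunk needs its own query or WHERE clause.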

    Enjoy.
