Thursday, 24 January 2019

CSV Read operation in Spark with Scala

Input Data:
emp_data.csv
---------
empno,ename,designation,manager,hire_date,sal,deptno
7369,SMITH,CLERK,7902,12/17/1980,800,20
7499,ALLEN,SALESMAN,7698,2/20/1981,1600,30
7521,WARD,SALESMAN,7698,2/22/1981,1250,30
7566,TURNER,MANAGER,7839,4/2/1981,2975,20
7654,MARTIN,SALESMAN,7698,9/28/1981,1250,30
7698,MILLER,MANAGER,7839,5/1/1981,2850,30
7782,CLARK,MANAGER,7839,6/9/1981,2450,10
7788,SCOTT,ANALYST,7566,12/9/1982,3000,20
7839,KING,PRESIDENT,NULL,11/17/1981,5000,10
7844,TURNER,SALESMAN,7698,9/8/1981,1500,30
7876,ADAMS,CLERK,7788,1/12/1983,1100,20
7900,JAMES,CLERK,7698,12/3/1981,950,30
7902,FORD,ANALYST,7566,12/3/1981,3000,20
7934,MILLER,CLERK,7782,1/23/1982,1300,10


import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf,SparkContext}

object LoadCSV2RDDtoDF{
  // Schema for one employee record. Every field is kept as String to mirror the
  // raw CSV columns. NOTE(review): currently unused by main (kept for callers /
  // future use with the manual-parsing approach below).
  case class Employee(empno:String, ename:String, designation:String, manager:String, hire_date:String, sal:String, deptno:String)

  /** Loads the employee CSV into a DataFrame and prints it to stdout.
    *
    * @param args optional; args(0) overrides the default CSV path, so the job is
    *             no longer tied to one machine's filesystem layout
    */
  def main(args:Array[String]):Unit ={
    val conf = new SparkConf().setAppName("CSV Loader").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Generalized: the CSV path may be supplied as the first CLI argument;
    // the original hard-coded path remains the default for backward compatibility.
    val path = args.headOption.getOrElse("/home/hadoop/IdeaProjects/SparkJob/src/main/resources/emp_data.csv")

    /*
    // Alternative: parse each line manually and build the DataFrame from an RDD.
    // BUGFIX (was broken when commented out): the original skipped fields(1)
    // (ename) and mislabeled every later column by one position relative to the
    // CSV header "empno,ename,designation,manager,hire_date,sal,deptno".
    val txtRDD = sc.textFile(path)

    val empRDD = txtRDD.map {
      line =>
        val fields = line.split(",")
        val empno = fields(0)
        val ename = fields(1)
        val designation = fields(2)
        val manager = fields(3)
        val hire_date = fields(4)
        val sal = fields(5)
        val deptno = fields(6)
        (empno, ename, designation, manager, hire_date, sal, deptno)
    }
    val empDF = empRDD.toDF("#No","Name","Designation","Manager","Joining Date","Salary","Department#")
    empDF.show()
    */

    try {
      // Built-in CSV reader: the header row becomes column names and column
      // types are inferred from the data (one extra pass over the file).
      val eDF = sqlContext.read
        .format("csv")
        .option("header","true")
        .option("inferSchema","true")
        .load(path)
      eDF.show()
    } finally {
      sc.stop() // release Spark resources even if the load/show fails
    }
  }
}




No comments:

Post a Comment

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...