Saturday, 2 February 2019

Cars Data Analysis using Apache Spark

Input
cars.txt:

make Model MPG Cylinders Engine Disp Horsepower Weight Accelerate Year Origin
amc amc ambassador dpl 15 8 390 190 3850 8.5 70 American
amc amc gremlin 21 6 199 90 2648 15 70 American
amc amc hornet 18 6 199 97 2774 15.5 70 American
amc amc rebel sst 16 8 304 150 3433 12 70 American
buick buick estate wagon (sw) 14 8 455 225 3086 10 70 American
buick buick skylark 320 15 8 350 165 3693 11.5 70 American
chevrolet chevrolet chevelle malibu 18 8 307 130 3504 12 70 American
chevrolet chevrolet impala 14 8 454 220 4354 9 70 American
chevrolet chevrolet monte carlo 15 8 400 150 3761 9.5 70 American
chevy chevy c20 10 8 307 200 4376 15 70 American
dodge dodge challenger se 15 8 383 170 3563 10 70 American
dodge dodge d200 11 8 318 210 4382 13.5 70 American
ford ford f250 10 8 360 215 4615 14 70 American
ford ford galaxie 500 15 8 429 198 4341 10 70 American
ford ford maverick 21 6 200 85 2587 16 70 American
ford ford torino 17 8 302 140 3449 10.5 70 American
hi hi 1200d 9 8 304 193 4732 18.5 70 American
plymouth plymouth 'cuda 340 14 8 340 160 3609 8 70 American
plymouth plymouth duster 22 6 198 95 2833 15.5 70 American
plymouth plymouth fury iii 14 8 440 215 4312 8.5 70 American
plymouth plymouth satellite 18 8 318 150 3436 11 70 American
pontiac pontiac catalina 14 8 455 225 4425 10 70 American
audi audi 100 ls 24 4 107 90 2430 14.5 70 European
bmw bmw 2002 26 4 121 113 2234 12.5 70 European
peugeot peugeot 504 25 4 110 87 2672 17.5 70 European
saab saab 99e 25 4 104 95 2375 17.5 70 European
volkswagen volkswagen 1131 deluxe sedan 26 4 97 46 1835 20.5 70 European
datsun datsun pl510 27 4 97 88 2130 14.5 70 Japanese
toyato toyota corona mark ii 24 4 113 95 2372 15 70 Japanese
amc amc gremlin 19 6 232 100 2634 13 71 American
amc amc hornet sportabout (sw) 18 6 258 110 2962 13.5 71 American
amc amc matador 18 6 232 100 3288 15.5 71 American
chevrolet chevrolet chevelle malibu 17 6 250 100 3329 15.5 71 American
chevrolet chevrolet impala 14 8 350 165 4209 12 71 American
chevrolet chevrolet vega (sw) 22 4 140 72 2408 19 71 American
chevrolet chevrolet vega 2300 28 4 140 90 2264 15.5 71 American
dodge dodge monaco (sw) 12 8 383 180 4955 11.5 71 American
ford ford country squire (sw) 13 8 400 170 4746 12 71 American
ford ford galaxie 500 14 8 351 153 4154 13.5 71 American
ford ford mustang 18 6 250 88 3139 14.5 71 American
ford ford torino 500 19 6 250 88 3302 15.5 71 American
mercury mercury capri 2000 23 4 122 86 2220 14 71 American
plymouth plymouth cricket 26 4 91 70 1955 20.5 71 American
plymouth plymouth fury iii 14 8 318 150 4096 13 71 American
plymouth plymouth satellite custom 16 6 225 105 3439 15.5 71 American
pontiac pontiac catalina brougham 14 8 400 175 4464 11.5 71 American
pontiac pontiac firebird 19 6 250 100 3282 15 71 American
pontiac pontiac safari (sw) 13 8 400 175 5140 12 71 American
fiat fiat 124b 30 4 88 76 2065 14.5 71 European
opel opel 1900 28 4 116 90 2123 14 71 European
peugeot peugeot 304 30 4 79 70 2074 19.5 71 European
volkswagen volkswagen model 111 27 4 97 60 1834 19 71 European
datsun datsun 1200 35 4 72 69 1613 18 71 Japanese
datsun datsun pl510 27 4 97 88 2130 14.5 71 Japanese
toyato toyota corolla 1200 31 4 71 65 1773 19 71 Japanese
toyato toyota corona 25 4 113 95 2228 14 71 Japanese
amc amc ambassador sst 17 8 304 150 3672 11.5 72 American
amc amc matador (sw) 15 8 304 150 3892 12.5 72 American
buick buick lesabre custom 13 8 350 155 4502 13.5 72 American
chevrolet chevrolet chevelle concours (sw) 13 8 307 130 4098 14 72 American
chevrolet chevrolet impala 13 8 350 165 4274 12 72 American
chevrolet chevrolet vega 20 4 140 90 2408 19.5 72 American



hadoop@hadoop:~/Desktop/vow$ hdfs dfs -copyFromLocal cars.txt /user/


hadoop@hadoop:~/Desktop/vow$ hdfs dfs -head /user/cars.txt
make Model MPG Cylinders Engine Disp Horsepower Weight Accelerate Year Origin
amc amc ambassador dpl 15 8 390 190 3850 8.5 70 American
amc amc gremlin 21 6 199 90 2648 15 70 American
amc amc hornet 18 6 199 97 2774 15.5 70 American
amc amc rebel sst 16 8 304 150 3433 12 70 American

// Load cars.txt from HDFS as an RDD of text lines (one String per line, header row included)
scala> val carsInputRDD = sc.textFile("hdfs://localhost:9000/user/cars.txt")
carsInputRDD: org.apache.spark.rdd.RDD[String] = hdfs://localhost:9000/user/cars.txt MapPartitionsRDD[3] at textFile at <console>:24


// The first line of the file is the column header row
scala> val header = carsInputRDD.first()
header: String = make Model MPG Cylinders Engine Disp Horsepower Weight Accelerate Year Origin

// Keep only the data rows by dropping every line that equals the header
scala> val headerless = carsInputRDD.filter(_ != header)
headerless: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[5] at filter at <console>:29


// Record type for one row of cars.txt. Numeric columns use Scala's Int/Double rather than
// the boxed, nullable java.lang.Integer, so they can be used in arithmetic directly.
case class cars(make: String, model: String, mpg: Int, cylinders: Int, engine_disp: Int, horsepower: Int, weight: Int, accelerate: Double, year: Int, origin: String)

// Split every tab-separated data row into its 10 columns and build a cars record from them.
// Columns 0, 1 and 9 stay as text; columns 2-6 and 8 are parsed as Int, column 7 as Double.
val carsParsedRDD = headerless.map { line =>
  val cols = line.split("\t")
  cars(
    make = cols(0),
    model = cols(1),
    mpg = cols(2).toInt,
    cylinders = cols(3).toInt,
    engine_disp = cols(4).toInt,
    horsepower = cols(5).toInt,
    weight = cols(6).toInt,
    accelerate = cols(7).toDouble,
    year = cols(8).toInt,
    origin = cols(9)
  )
}
 
// Cache the parsed RDD in memory so the actions below can reuse it
scala> carsParsedRDD.cache()


// Peek at the first three parsed records
scala> carsParsedRDD.take(3).foreach(println)
cars(amc,amc ambassador dpl,15,8,390,190,3850,8.5,70,American)
cars(amc,amc gremlin,21,6,199,90,2648,15.0,70,American)
cars(amc,amc hornet,18,6,199,97,2774,15.5,70,American)


// Count cars per origin: emit (origin, 1) for every car, then sum the ones for each origin
val originwiseCount = carsParsedRDD.map(car => (car.origin, 1)).reduceByKey(_ + _).collect

// Print the collected (origin, count) pairs
scala> originwiseCount.foreach(println)
(American,47)
(European,9)
(Japanese,6)

// Keep only the cars whose origin is American
scala> val americanCars = carsParsedRDD.filter(_.origin == "American")
americanCars: org.apache.spark.rdd.RDD[cars] = MapPartitionsRDD[11] at filter at <console>:27

// Count the American cars
scala> americanCars.count()
res10: Long = 47

// Collect to the driver before printing: foreach(println) called directly on an RDD runs on
// the executors, so on a cluster the rows would not show up in this shell.
scala> americanCars.collect.foreach(println)
cars(amc,amc ambassador dpl,15,8,390,190,3850,8.5,70,American)
cars(amc,amc gremlin,21,6,199,90,2648,15.0,70,American)
cars(amc,amc hornet,18,6,199,97,2774,15.5,70,American)
cars(amc,amc rebel sst,16,8,304,150,3433,12.0,70,American)
cars(buick,buick estate wagon (sw),14,8,455,225,3086,10.0,70,American)
cars(buick,buick skylark 320,15,8,350,165,3693,11.5,70,American)
cars(chevrolet,chevrolet chevelle malibu,18,8,307,130,3504,12.0,70,American)
cars(chevrolet,chevrolet impala,14,8,454,220,4354,9.0,70,American)
cars(chevrolet,chevrolet monte carlo,15,8,400,150,3761,9.5,70,American)
cars(chevy,chevy c20,10,8,307,200,4376,15.0,70,American)
cars(dodge,dodge challenger se,15,8,383,170,3563,10.0,70,American)
cars(dodge,dodge d200,11,8,318,210,4382,13.5,70,American)
cars(ford,ford f250,10,8,360,215,4615,14.0,70,American)
cars(ford,ford galaxie 500,15,8,429,198,4341,10.0,70,American)
cars(ford,ford maverick,21,6,200,85,2587,16.0,70,American)
cars(ford,ford torino,17,8,302,140,3449,10.5,70,American)
cars(hi,hi 1200d,9,8,304,193,4732,18.5,70,American)
cars(plymouth,plymouth 'cuda 340,14,8,340,160,3609,8.0,70,American)
cars(plymouth,plymouth duster,22,6,198,95,2833,15.5,70,American)
cars(plymouth,plymouth fury iii,14,8,440,215,4312,8.5,70,American)
cars(plymouth,plymouth satellite,18,8,318,150,3436,11.0,70,American)
cars(pontiac,pontiac catalina,14,8,455,225,4425,10.0,70,American)
cars(amc,amc gremlin,19,6,232,100,2634,13.0,71,American)
cars(amc,amc hornet sportabout (sw),18,6,258,110,2962,13.5,71,American)
cars(amc,amc matador,18,6,232,100,3288,15.5,71,American)
cars(chevrolet,chevrolet chevelle malibu,17,6,250,100,3329,15.5,71,American)
cars(chevrolet,chevrolet impala,14,8,350,165,4209,12.0,71,American)
cars(chevrolet,chevrolet vega (sw),22,4,140,72,2408,19.0,71,American)
cars(chevrolet,chevrolet vega 2300,28,4,140,90,2264,15.5,71,American)
cars(dodge,dodge monaco (sw),12,8,383,180,4955,11.5,71,American)
cars(ford,ford country squire (sw),13,8,400,170,4746,12.0,71,American)
cars(ford,ford galaxie 500,14,8,351,153,4154,13.5,71,American)
cars(ford,ford mustang,18,6,250,88,3139,14.5,71,American)
cars(ford,ford torino 500,19,6,250,88,3302,15.5,71,American)
cars(mercury,mercury capri 2000,23,4,122,86,2220,14.0,71,American)
cars(plymouth,plymouth cricket,26,4,91,70,1955,20.5,71,American)
cars(plymouth,plymouth fury iii,14,8,318,150,4096,13.0,71,American)
cars(plymouth,plymouth satellite custom,16,6,225,105,3439,15.5,71,American)
cars(pontiac,pontiac catalina brougham,14,8,400,175,4464,11.5,71,American)
cars(pontiac,pontiac firebird,19,6,250,100,3282,15.0,71,American)
cars(pontiac,pontiac safari (sw),13,8,400,175,5140,12.0,71,American)
cars(amc,amc ambassador sst,17,8,304,150,3672,11.5,72,American)
cars(amc,amc matador (sw),15,8,304,150,3892,12.5,72,American)
cars(buick,buick lesabre custom,13,8,350,155,4502,13.5,72,American)
cars(chevrolet,chevrolet chevelle concours (sw),13,8,307,130,4098,14.0,72,American)
cars(chevrolet,chevrolet impala,13,8,350,165,4274,12.0,72,American)
cars(chevrolet,chevrolet vega,20,4,140,90,2408,19.5,72,American)



// For every make, build a (total weight, number of cars) pair with combineByKey:
//   1st function - starts the pair from the first weight seen for a make
//   2nd function - folds one more weight into an existing pair
//   3rd function - merges two partial pairs for the same make
val makeWeightSum = americanCars.map(car => (car.make, car.weight.toInt)).combineByKey(
  (weight: Int) => (weight, 1),
  (acc: (Int, Int), weight: Int) => (acc._1 + weight, acc._2 + 1),
  (left: (Int, Int), right: (Int, Int)) => (left._1 + right._1, left._2 + right._2))


// Print (make, (total weight, car count)) for every make
scala> makeWeightSum.collect.foreach(println)
(hi,(4732,1))
(chevy,(4376,1))
(chevrolet,(34609,10))
(plymouth,(23680,7))
(buick,(11281,3))
(mercury,(2220,1))
(amc,(29153,9))
(dodge,(12900,3))
(pontiac,(17311,4))
(ford,(30333,8))

// Average weight per make = total weight / car count (Int division, so any fraction is dropped)
scala> val makeWeightAvg = makeWeightSum.map { case (make, (total, count)) => (make, total / count) }
makeWeightAvg: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[16] at map at <console>:27

// Collect to the driver before printing: foreach(println) called directly on an RDD runs on
// the executors, so on a cluster the averages would not show up in this shell.
scala> makeWeightAvg.collect.foreach(println)
(hi,4732)
(chevy,4376)
(chevrolet,3460)
(plymouth,3382)
(buick,3760)
(mercury,2220)
(amc,3239)
(dodge,4300)
(pontiac,4327)
(ford,3791)





Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...