Thursday, 13 August 2020

Inner, Left, Right, and Full Joins of Tables in Spark, with Scala Examples

// Inner, left, right, and full joins of two tables with id as the common column
BasketA:
--------
1,Apple
2,Banana
3,Carrot
7,Mango
10,Radish

BasketB:
--------
1,Potato
2,Tomato
3,Apple
6,Banana
7,Carrot
8,Turnip

ohm.basketA:
// Create the basketA table in the ohm database and load data from a local file
 
hive> create table ohm.basketA (id int, name string) row format delimited fields terminated by ",";
OK
Time taken: 0.346 seconds

hive> load data local inpath 'basketA.csv' into table ohm.basketA;
Loading data to table ohm.basketa
Table ohm.basketa stats: [numFiles=1, totalSize=44]
OK
Time taken: 0.639 seconds


hive> select * from ohm.basketA;
OK
1 Apple
2 Banana
3 Carrot
7 Mango
10 Radish
Time taken: 0.539 seconds, Fetched: 5 row(s)



// Create the basketB table in the ohm database and load data from a local file
hive> create table ohm.basketB (id int, name string) row format delimited fields terminated by ",";
OK
Time taken: 0.064 seconds


hive> load data local inpath 'basketB.csv' into table ohm.basketB;
Loading data to table ohm.basketb
Table ohm.basketb stats: [numFiles=1, totalSize=53]
OK
Time taken: 0.18 seconds


hive> select * from ohm.basketB;
OK
1 Potato
2 Tomato
3 Apple
6 Banana
7 Carrot
8 Turnip
Time taken: 0.069 seconds, Fetched: 6 row(s)
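
If Hive isn't available, the same DataFrames can be built straight from the CSV files in the Spark shell. A minimal sketch, assuming the files are reachable from the driver (the paths below are illustrative):

import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType}

val schema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StringType)
))

// Paths are assumptions; point them at wherever the CSVs actually live
val dfAFromCsv = spark.read.schema(schema).csv("basketA.csv")
val dfBFromCsv = spark.read.schema(schema).csv("basketB.csv")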






scala> val dfA = spark.table("ohm.basketA")
dfA: org.apache.spark.sql.DataFrame = [id: int, name: string]

scala> dfA.printSchema()
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)


scala> dfA.show()
+---+------+
| id|  name|
+---+------+
|  1| Apple|
|  2|Banana|
|  3|Carrot|
|  7| Mango|
| 10|Radish|
+---+------+




scala> val dfB = spark.table("ohm.basketB")
dfB: org.apache.spark.sql.DataFrame = [id: int, name: string]

scala> dfB.printSchema()
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)


scala> dfB.show()
+---+------+
| id|  name|
+---+------+
|  1|Potato|
|  2|Tomato|
|  3| Apple|
|  6|Banana|
|  7|Carrot|
|  8|Turnip|
+---+------+


// Inner Join ---> only the rows whose id matches on both sides
scala> val innerJoined = dfA.join(dfB,dfA("id") === dfB("id"))
innerJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> innerJoined.show()
+---+------+---+------+
| id|  name| id|  name|
+---+------+---+------+
|  1| Apple|  1|Potato|
|  2|Banana|  2|Tomato|
|  3|Carrot|  3| Apple|
|  7| Mango|  7|Carrot|
+---+------+---+------+
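
Note that the joined result carries two id columns, one from each input. When the join key has the same name on both sides, the join can also be written with a sequence of column names so the key appears only once; renaming the name columns first keeps the output unambiguous. A minimal sketch (nameA/nameB are illustrative aliases):

// Same inner join, but the shared "id" column appears only once
val innerJoinedSingleKey = dfA.withColumnRenamed("name", "nameA")
  .join(dfB.withColumnRenamed("name", "nameB"), Seq("id"))
innerJoinedSingleKey.show()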




// Left Join ---> all the rows from the left side; right-side columns are NULL where there is no match
scala>  val leftJoined = dfA.join(dfB,dfA("id") === dfB("id"),"Left")
leftJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> leftJoined.show()
+---+------+----+------+
| id|  name|  id|  name|
+---+------+----+------+
|  1| Apple|   1|Potato|
|  2|Banana|   2|Tomato|
|  3|Carrot|   3| Apple|
|  7| Mango|   7|Carrot|
| 10|Radish|null|  null|
+---+------+----+------+
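
A related variant: a left anti join keeps only the left-side rows that found no match on the right, which here would isolate the (10, Radish) row. A minimal sketch:

// Left anti join: rows of dfA whose id does not appear in dfB
val unmatchedInA = dfA.join(dfB, dfA("id") === dfB("id"), "left_anti")
unmatchedInA.show()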


// Right Join ---> all the rows from the right side; left-side columns are NULL where there is no match
scala> val rightJoined = dfA.join(dfB, dfA("id") === dfB("id"),"right")
rightJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> rightJoined.show()
+----+------+---+------+
|  id|  name| id|  name|
+----+------+---+------+
|   1| Apple|  1|Potato|
|   2|Banana|  2|Tomato|
|   3|Carrot|  3| Apple|
|null|  null|  6|Banana|
|   7| Mango|  7|Carrot|
|null|  null|  8|Turnip|
+----+------+---+------+
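
The same join can also be expressed in Spark SQL after registering the DataFrames as temporary views. A minimal sketch (the view names are illustrative):

dfA.createOrReplaceTempView("basketA")
dfB.createOrReplaceTempView("basketB")

// Equivalent of dfA.join(dfB, dfA("id") === dfB("id"), "right")
val rightJoinedSql = spark.sql(
  "SELECT a.*, b.* FROM basketA a RIGHT JOIN basketB b ON a.id = b.id")
rightJoinedSql.show()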



// Full (Outer) Join ---> all rows from both sides; NULL on whichever side has no match
scala> val fullJoined = dfA.join(dfB, dfA("id") === dfB("id"),"full")
fullJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> fullJoined.show()
+----+------+----+------+                                                       
|  id|  name|  id|  name|
+----+------+----+------+
|   1| Apple|   1|Potato|
|null|  null|   6|Banana|
|   3|Carrot|   3| Apple|
|null|  null|   8|Turnip|
|   7| Mango|   7|Carrot|
|  10|Radish|null|  null|
|   2|Banana|   2|Tomato|
+----+------+----+------+
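
Because either side's id can be NULL in a full join, a common follow-up is to coalesce the two key columns into a single one. A minimal sketch (the nameA/nameB aliases are illustrative):

import org.apache.spark.sql.functions.coalesce

// Collapse the two nullable id columns into one key column
val fullMerged = fullJoined.select(
  coalesce(dfA("id"), dfB("id")).as("id"),
  dfA("name").as("nameA"),
  dfB("name").as("nameB"))
fullMerged.show()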
