// Inner, left, right, and full joins of two tables, with id as the common column
BasketA:
--------
1,Apple
2,Banana
3,Carrot
7,Mango
10,Radish
BasketB:
--------
1,Potato
2,Tomato
3,Apple
6,Banana
7,Carrot
8,Turnip
ohm.basketA:
// Create basketA table in ohm database and load data from local file
hive> create table ohm.basketA (id int, name string) row format delimited fields terminated by ",";
OK
Time taken: 0.346 seconds
hive> load data local inpath 'basketA.csv' into table ohm.basketA;
Loading data to table ohm.basketa
Table ohm.basketa stats: [numFiles=1, totalSize=44]
OK
Time taken: 0.639 seconds
hive> select * from ohm.basketA;
OK
1 Apple
2 Banana
3 Carrot
7 Mango
10 Radish
Time taken: 0.539 seconds, Fetched: 5 row(s)
// Create basketB table in ohm database and load data from local file
hive> create table ohm.basketB (id int, name string) row format delimited fields terminated by ",";
OK
Time taken: 0.064 seconds
hive> load data local inpath 'basketB.csv' into table ohm.basketB;
Loading data to table ohm.basketb
Table ohm.basketb stats: [numFiles=1, totalSize=53]
OK
Time taken: 0.18 seconds
hive> select * from ohm.basketB;
OK
1 Potato
2 Tomato
3 Apple
6 Banana
7 Carrot
8 Turnip
Time taken: 0.069 seconds, Fetched: 6 row(s)
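// If a Hive metastore is not available, the same two DataFrames can be built by
// reading the CSV files directly in Spark. A minimal sketch, assuming the files
// sit in the current working directory and Spark 2.x; the val names are illustrative:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType}

val basketSchema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StringType)
))

val dfAFromCsv = spark.read.schema(basketSchema).csv("basketA.csv")
val dfBFromCsv = spark.read.schema(basketSchema).csv("basketB.csv")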
scala> val dfA = spark.table("ohm.basketA")
dfA: org.apache.spark.sql.DataFrame = [id: int, name: string]
scala> dfA.printSchema()
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
scala> dfA.show()
+---+------+
| id| name|
+---+------+
| 1| Apple|
| 2|Banana|
| 3|Carrot|
| 7| Mango|
| 10|Radish|
+---+------+
scala> val dfB = spark.table("ohm.basketB")
dfB: org.apache.spark.sql.DataFrame = [id: int, name: string]
scala> dfB.printSchema()
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
scala> dfB.show()
+---+------+
| id| name|
+---+------+
| 1|Potato|
| 2|Tomato|
| 3| Apple|
| 6|Banana|
| 7|Carrot|
| 8|Turnip|
+---+------+
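// The same joins can also be written in Spark SQL by registering the DataFrames
// as temporary views. A sketch, assuming Spark 2.x (createOrReplaceTempView);
// the view names are illustrative:
dfA.createOrReplaceTempView("basketA_v")
dfB.createOrReplaceTempView("basketB_v")
spark.sql("select * from basketA_v a inner join basketB_v b on a.id = b.id").show()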
// Inner Join ---> Only the rows whose id appears in both tables
scala> val innerJoined = dfA.join(dfB, dfA("id") === dfB("id"))
innerJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> innerJoined.show()
+---+------+---+------+
| id| name| id| name|
+---+------+---+------+
| 1| Apple| 1|Potato|
| 2|Banana| 2|Tomato|
| 3|Carrot| 3| Apple|
| 7| Mango| 7|Carrot|
+---+------+---+------+
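// Note that joining on dfA("id") === dfB("id") keeps both id columns. Passing the
// key as Seq("id") merges them into one; renaming one name column (nameB is an
// illustrative alias) avoids the remaining ambiguity:
val innerJoinedOneId = dfA.join(dfB.withColumnRenamed("name", "nameB"), Seq("id"))
innerJoinedOneId.show()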
// Left Join ---> All the rows from the left side; unmatched rows on the right side are filled with NULLs
scala> val leftJoined = dfA.join(dfB, dfA("id") === dfB("id"), "left")
leftJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> leftJoined.show()
+---+------+----+------+
| id| name| id| name|
+---+------+----+------+
| 1| Apple| 1|Potato|
| 2|Banana| 2|Tomato|
| 3|Carrot| 3| Apple|
| 7| Mango| 7|Carrot|
| 10|Radish|null| null|
+---+------+----+------+
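// A related variant: "left_anti" keeps only the left-side rows that have no match
// on the right (here that would be id 10, Radish). A sketch, assuming Spark 2.x:
val onlyInA = dfA.join(dfB, dfA("id") === dfB("id"), "left_anti")
onlyInA.show()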
// Right Join ---> All the rows from the right side; unmatched rows on the left side are filled with NULLs
scala> val rightJoined = dfA.join(dfB, dfA("id") === dfB("id"),"right")
rightJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> rightJoined.show()
+----+------+---+------+
| id| name| id| name|
+----+------+---+------+
| 1| Apple| 1|Potato|
| 2|Banana| 2|Tomato|
| 3|Carrot| 3| Apple|
|null| null| 6|Banana|
| 7| Mango| 7|Carrot|
|null| null| 8|Turnip|
+----+------+---+------+
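// Full Join ---> All the rows from both sides; unmatched rows on either side are filled with NULLs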
scala> val fullJoined = dfA.join(dfB, dfA("id") === dfB("id"),"full")
fullJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> fullJoined.show()
+----+------+----+------+
| id| name| id| name|
+----+------+----+------+
| 1| Apple| 1|Potato|
|null| null| 6|Banana|
| 3|Carrot| 3| Apple|
|null| null| 8|Turnip|
| 7| Mango| 7|Carrot|
| 10|Radish|null| null|
| 2|Banana| 2|Tomato|
+----+------+----+------+
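// One way to tidy the full-join output is to merge the two id columns with
// coalesce; the nameA/nameB aliases below are illustrative:
import org.apache.spark.sql.functions.coalesce
val fullJoinedOneId = fullJoined.select(
  coalesce(dfA("id"), dfB("id")).as("id"),
  dfA("name").as("nameA"),
  dfB("name").as("nameB")
)
fullJoinedOneId.show()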