Thursday, 13 August 2020

Inner, Left, Right, and Full Joins of Tables in Spark, with Scala Examples

// Inner, left, right, and full joins of two tables with id as the common column
BasketA:
--------
1,Apple
2,Banana
3,Carrot
7,Mango
10,Radish

BasketB:
--------
1,Potato
2,Tomato
3,Apple
6,Banana
7,Carrot
8,Turnip

ohm.basketA:
// Create the basketA table in the ohm database and load data from a local file
 
hive> create table ohm.basketA (id int, name string) row format delimited fields terminated by ",";
OK
Time taken: 0.346 seconds

hive> load data local inpath 'basketA.csv' into table ohm.basketA;
Loading data to table ohm.basketa
Table ohm.basketa stats: [numFiles=1, totalSize=44]
OK
Time taken: 0.639 seconds


hive> select * from ohm.basketA;
OK
1 Apple
2 Banana
3 Carrot
7 Mango
10 Radish
Time taken: 0.539 seconds, Fetched: 5 row(s)



// Create the basketB table in the ohm database and load data from a local file
hive> create table ohm.basketB (id int, name string) row format delimited fields terminated by ",";
OK
Time taken: 0.064 seconds


hive> load data local inpath 'basketB.csv' into table ohm.basketB;
Loading data to table ohm.basketb
Table ohm.basketb stats: [numFiles=1, totalSize=53]
OK
Time taken: 0.18 seconds


hive> select * from ohm.basketB;
OK
1 Potato
2 Tomato
3 Apple
6 Banana
7 Carrot
8 Turnip
Time taken: 0.069 seconds, Fetched: 6 row(s)
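
If Hive isn't available, the same DataFrames can be built straight from the CSV files in the Spark shell. A minimal sketch, assuming the files are reachable from the driver (the paths below are illustrative):

import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType}

val schema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StringType)
))

// Paths are assumptions; point them at wherever the CSVs actually live
val dfAFromCsv = spark.read.schema(schema).csv("basketA.csv")
val dfBFromCsv = spark.read.schema(schema).csv("basketB.csv")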






scala> val dfA = spark.table("ohm.basketA")
dfA: org.apache.spark.sql.DataFrame = [id: int, name: string]

scala> dfA.printSchema()
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)


scala> dfA.show()
+---+------+
| id|  name|
+---+------+
|  1| Apple|
|  2|Banana|
|  3|Carrot|
|  7| Mango|
| 10|Radish|
+---+------+




scala> val dfB = spark.table("ohm.basketB")
dfB: org.apache.spark.sql.DataFrame = [id: int, name: string]

scala> dfB.printSchema()
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)


scala> dfB.show()
+---+------+
| id|  name|
+---+------+
|  1|Potato|
|  2|Tomato|
|  3| Apple|
|  6|Banana|
|  7|Carrot|
|  8|Turnip|
+---+------+


// Inner Join ---> only the rows whose id matches on both sides
scala> val innerJoined = dfA.join(dfB,dfA("id") === dfB("id"))
innerJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> innerJoined.show()
+---+------+---+------+
| id|  name| id|  name|
+---+------+---+------+
|  1| Apple|  1|Potato|
|  2|Banana|  2|Tomato|
|  3|Carrot|  3| Apple|
|  7| Mango|  7|Carrot|
+---+------+---+------+
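
Note that the joined result carries two id columns, one from each input. When the join key has the same name on both sides, the join can also be written with a sequence of column names so the key appears only once; renaming the name columns first keeps the output unambiguous. A minimal sketch (nameA/nameB are illustrative aliases):

// Same inner join, but the shared "id" column appears only once
val innerJoinedSingleKey = dfA.withColumnRenamed("name", "nameA")
  .join(dfB.withColumnRenamed("name", "nameB"), Seq("id"))
innerJoinedSingleKey.show()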




// Left Join ---> all the rows from the left side; right-side columns are NULL where there is no match
scala>  val leftJoined = dfA.join(dfB,dfA("id") === dfB("id"),"Left")
leftJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> leftJoined.show()
+---+------+----+------+
| id|  name|  id|  name|
+---+------+----+------+
|  1| Apple|   1|Potato|
|  2|Banana|   2|Tomato|
|  3|Carrot|   3| Apple|
|  7| Mango|   7|Carrot|
| 10|Radish|null|  null|
+---+------+----+------+
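
A related variant: a left anti join keeps only the left-side rows that found no match on the right, which here would isolate the (10, Radish) row. A minimal sketch:

// Left anti join: rows of dfA whose id does not appear in dfB
val unmatchedInA = dfA.join(dfB, dfA("id") === dfB("id"), "left_anti")
unmatchedInA.show()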


// Right Join ---> all the rows from the right side; left-side columns are NULL where there is no match
scala> val rightJoined = dfA.join(dfB, dfA("id") === dfB("id"),"right")
rightJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> rightJoined.show()
+----+------+---+------+
|  id|  name| id|  name|
+----+------+---+------+
|   1| Apple|  1|Potato|
|   2|Banana|  2|Tomato|
|   3|Carrot|  3| Apple|
|null|  null|  6|Banana|
|   7| Mango|  7|Carrot|
|null|  null|  8|Turnip|
+----+------+---+------+
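
The same join can also be expressed in Spark SQL after registering the DataFrames as temporary views. A minimal sketch (the view names are illustrative):

dfA.createOrReplaceTempView("basketA")
dfB.createOrReplaceTempView("basketB")

// Equivalent of dfA.join(dfB, dfA("id") === dfB("id"), "right")
val rightJoinedSql = spark.sql(
  "SELECT a.*, b.* FROM basketA a RIGHT JOIN basketB b ON a.id = b.id")
rightJoinedSql.show()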



// Full (Outer) Join ---> all rows from both sides; NULL on whichever side has no match
scala> val fullJoined = dfA.join(dfB, dfA("id") === dfB("id"),"full")
fullJoined: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> fullJoined.show()
+----+------+----+------+                                                       
|  id|  name|  id|  name|
+----+------+----+------+
|   1| Apple|   1|Potato|
|null|  null|   6|Banana|
|   3|Carrot|   3| Apple|
|null|  null|   8|Turnip|
|   7| Mango|   7|Carrot|
|  10|Radish|null|  null|
|   2|Banana|   2|Tomato|
+----+------+----+------+
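
Because either side's id can be NULL in a full join, a common follow-up is to coalesce the two key columns into a single one. A minimal sketch (the nameA/nameB aliases are illustrative):

import org.apache.spark.sql.functions.coalesce

// Collapse the two nullable id columns into one key column
val fullMerged = fullJoined.select(
  coalesce(dfA("id"), dfB("id")).as("id"),
  dfA("name").as("nameA"),
  dfB("name").as("nameB"))
fullMerged.show()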
