Wednesday, 12 August 2020

How to add an auto-generated index column to an existing DataFrame? | Interview Questions



// How to add an auto-generated index column to an existing DataFrame?

scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

scala> val myDF = Seq( ("Raja","Bsc"), ("Ravi","BE"), ("Anil","MA"), ("Siva","Msc"), ("Rani","MA"), ("Kumar","Bsc")).toDF("name","qual")
myDF: org.apache.spark.sql.DataFrame = [name: string, qual: string]


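// Here we add the auto-generated column. The generated IDs are guaranteed to be unique and
// monotonically increasing, but not necessarily consecutive once the data spans several partitions.
// Note: monotonicallyIncreasingId is deprecated (hence the warning below); prefer monotonically_increasing_id().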
scala> val df1 = myDF.withColumn("id",monotonicallyIncreasingId)
warning: there was one deprecation warning; re-run with -deprecation for details
df1: org.apache.spark.sql.DataFrame = [name: string, qual: string ... 1 more field]

scala> df1.printSchema()
root
 |-- name: string (nullable = true)
 |-- qual: string (nullable = true)
 |-- id: long (nullable = false)


scala> df1.show()
+-----+----+---+
| name|qual| id|
+-----+----+---+
| Raja| Bsc|  0|
| Ravi|  BE|  1|
| Anil|  MA|  2|
| Siva| Msc|  3|
| Rani|  MA|  4|
|Kumar| Bsc|  5|
+-----+----+---+

// Remove an existing column 
scala> val df2 = df1.drop($"id")
df2: org.apache.spark.sql.DataFrame = [name: string, qual: string]

scala> df2.printSchema()
root
 |-- name: string (nullable = true)
 |-- qual: string (nullable = true)


scala> df2.show()
+-----+----+
| name|qual|
+-----+----+
| Raja| Bsc|
| Ravi|  BE|
| Anil|  MA|
| Siva| Msc|
| Rani|  MA|
|Kumar| Bsc|
+-----+----+

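// Import Window, which is needed to build the window specification used below.
// row_number() over an ordered window yields consecutive ids starting at 1, at the cost of
// moving all rows to a single partition (see the WARN line in the output).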
import org.apache.spark.sql.expressions.Window


scala> val df = df2.withColumn("id",row_number().over(Window.orderBy("name")))
df: org.apache.spark.sql.DataFrame = [name: string, qual: string ... 1 more field]


scala> df.show()
20/08/12 02:56:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+-----+----+---+
| name|qual| id|
+-----+----+---+
| Anil|  MA|  1|
|Kumar| Bsc|  2|
| Raja| Bsc|  3|
| Rani|  MA|  4|
| Ravi|  BE|  5|
| Siva| Msc|  6|
+-----+----+---+

No comments:

Post a Comment

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...