// How to add an auto-generated index column to an existing DataFrame.
// NOTE: this is a spark-shell (REPL) transcript — lines beginning with "scala>"
// are the commands typed; the unprefixed lines are the shell's captured output.
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._
scala> val myDF = Seq( ("Raja","Bsc"), ("Ravi","BE"), ("Anil","MA"), ("Siva","Msc"), ("Rani","MA"), ("Kumar","Bsc")).toDF("name","qual")
myDF: org.apache.spark.sql.DataFrame = [name: string, qual: string]
// Here we add the auto-generated id column.
// NOTE(review): monotonicallyIncreasingId is deprecated (hence the warning
// below) — prefer monotonically_increasing_id() in current Spark versions.
// Per the Spark docs the generated ids are unique and monotonically increasing
// but NOT guaranteed consecutive across partitions; they come out as 0..5 in
// this run only because the data happens to sit in a single partition.
scala> val df1 = myDF.withColumn("id",monotonicallyIncreasingId)
warning: there was one deprecation warning; re-run with -deprecation for details
df1: org.apache.spark.sql.DataFrame = [name: string, qual: string ... 1 more field]
scala> df1.printSchema()
root
|-- name: string (nullable = true)
|-- qual: string (nullable = true)
|-- id: long (nullable = false)
scala> df1.show()
+-----+----+---+
| name|qual| id|
+-----+----+---+
| Raja| Bsc| 0|
| Ravi| BE| 1|
| Anil| MA| 2|
| Siva| Msc| 3|
| Rani| MA| 4|
|Kumar| Bsc| 5|
+-----+----+---+
// Remove an existing column (drop accepts a Column, as here, or a name string).
scala> val df2 = df1.drop($"id")
df2: org.apache.spark.sql.DataFrame = [name: string, qual: string]
scala> df2.printSchema()
root
|-- name: string (nullable = true)
|-- qual: string (nullable = true)
scala> df2.show()
+-----+----+
| name|qual|
+-----+----+
| Raja| Bsc|
| Ravi| BE|
| Anil| MA|
| Siva| Msc|
| Rani| MA|
|Kumar| Bsc|
+-----+----+
// Namespace for Spark windowing functions.
import org.apache.spark.sql.expressions.Window
// Alternative: row_number() over an ORDER BY-only window yields consecutive,
// 1-based ids (ordered by name). Because no partitionBy is given, Spark moves
// ALL rows to a single partition (see the WARN below) — fine for small data,
// but a serious bottleneck at scale.
scala> val df = df2.withColumn("id",row_number().over(Window.orderBy("name")))
df: org.apache.spark.sql.DataFrame = [name: string, qual: string ... 1 more field]
scala> df.show()
20/08/12 02:56:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
+-----+----+---+
| name|qual| id|
+-----+----+---+
| Anil| MA| 1|
|Kumar| Bsc| 2|
| Raja| Bsc| 3|
| Rani| MA| 4|
| Ravi| BE| 5|
| Siva| Msc| 6|
+-----+----+---+