tblC.txt (input CSV file; the rows for Id 100 and Id 101 each appear twice):
Id,Name,Age
100,Anbu,55
101,Siva,33
100,Anbu,55
102,Kalai,33
103,Arivu,22
101,Siva,33
104,Ashok,52
105,Priya,22
scala> val dfC = spark.read.format("csv").option("header","True").option("inferSchema","True").load("D:\\Ex\\My\\tblC.txt")
dfC: org.apache.spark.sql.DataFrame = [Id: int, Name: string ... 1 more field]
scala> dfC.printSchema()
root
|-- Id: integer (nullable = true)
|-- Name: string (nullable = true)
|-- Age: integer (nullable = true)
scala> dfC.show()
+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|100| Anbu| 55|  <-- duplicate row
|101| Siva| 33|  <-- duplicate row
|100| Anbu| 55|  <-- duplicate row
|102|Kalai| 33|
|103|Arivu| 22|
|101| Siva| 33|  <-- duplicate row
|104|Ashok| 52|
|105|Priya| 22|
+---+-----+---+
scala> dfC.distinct.show()
+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|104|Ashok| 52|
|101| Siva| 33|
|105|Priya| 22|
|102|Kalai| 33|
|100| Anbu| 55|
|103|Arivu| 22|
+---+-----+---+
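// Drop duplicates: dropDuplicates() called with no column arguments compares all columns,
// so it returns the same rows as distinct above (row order in the output is not guaranteed).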
scala> val dfNoDuplicates = dfC.dropDuplicates()
dfNoDuplicates: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Id: int, Name: string ... 1 more field]
scala> dfNoDuplicates.show()
+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|104|Ashok| 52|
|101| Siva| 33|
|105|Priya| 22|
|102|Kalai| 33|
|100| Anbu| 55|
|103|Arivu| 22|
+---+-----+---+
No comments:
Post a Comment