Remove leading and trailing spaces from String columns in a Spark DataFrame with Scala
Sample input file (tblC.txt):
Id,Name,Age
100,Anbu ,55
101, Siva,33
100,Anbu ,55
102, Kalai ,33
103, Arivu ,22
101, Siva ,33
104, Ashok ,52
105, Priya ,22
scala> val dfC = spark.read.format("csv").option("header","True").option("inferSchema","True").load("D:\\Ex\\My\\tblC.txt")
dfC: org.apache.spark.sql.DataFrame = [Id: int, Name: string ... 1 more field]
// Note: the values in the Name column contain leading and trailing spaces.
scala> dfC.show()
+---+-------------+---+
| Id|         Name|Age|
+---+-------------+---+
|100| Anbu | 55|
|101| Siva| 33|
|100| Anbu | 55|
|102| Kalai | 33|
|103| Arivu | 22|
|101| Siva | 33|
|104| Ashok | 52|
|105| Priya | 22|
+---+-------------+---+
// Fix: add .option("ignoreLeadingWhiteSpace", "true").option("ignoreTrailingWhiteSpace", "true") to the CSV reader so that values are trimmed while the file is loaded.
scala> val dfC = spark.read.format("csv").option("header","True").option("inferSchema","True").option("ignoreLeadingWhiteSpace", "true").option("ignoreTrailingWhiteSpace", "true").load("D:\\Ex\\My\\tblC.txt")
dfC: org.apache.spark.sql.DataFrame = [Id: int, Name: string ... 1 more field]
// Now the Name values no longer have leading or trailing spaces.
scala> dfC.show()
+---+-----+---+
| Id| Name|Age|
+---+-----+---+
|100| Anbu| 55|
|101| Siva| 33|
|100| Anbu| 55|
|102|Kalai| 33|
|103|Arivu| 22|
|101| Siva| 33|
|104|Ashok| 52|
|105|Priya| 22|
+---+-----+---+
No comments:
Post a Comment