Monday, 25 May 2020

Row Object in PySpark Examples

# Build a DataFrame from Row objects created with keyword arguments.
# NOTE: with kwargs, Row sorts its field names alphabetically (see the
# output below: columns come out as city, id, name, salary), so the
# kwarg order in each Row call does not matter.
from pyspark.sql import Row

employees = [
    Row(id=100, name='Sara', salary=50000, city='Bangalore'),
    Row(id=101, salary=63000, name='Siva', city='Chennai'),
    Row(name='Malar', city='Mumbai', id=103, salary=63222),
]
df = spark.createDataFrame(employees)
df.show()

+---------+---+-----+------+
|     city| id| name|salary|
+---------+---+-----+------+
|Bangalore|100| Sara| 50000|
|  Chennai|101| Siva| 63000|
|   Mumbai|103|Malar| 63222|
+---------+---+-----+------+

df.select("id","name","salary","city").show()
+---+-----+------+---------+
| id| name|salary|     city|
+---+-----+------+---------+
|100| Sara| 50000|Bangalore|
|101| Siva| 63000|  Chennai|
|103|Malar| 63222|   Mumbai|
+---+-----+------+---------+



#with None
# Rows may carry None values; Spark infers every column as nullable
# and renders missing values as null in show() output.
from pyspark.sql import Row

people = [
    Row(id=100, name='Sara', city='Nellai', pin=627001),
    Row(id=102, name=None, city='Kovai', pin=None),
    Row(id=None, name='Raji', city=None, pin=None),
]
df = spark.createDataFrame(people)
df.select("id", "name", "city", "pin").show()

+----+----+------+------+
|  id|name|  city|   pin|
+----+----+------+------+
| 100|Sara|Nellai|627001|
| 102|null| Kovai|  null|
|null|Raji|  null|  null|
+----+----+------+------+

df.printSchema()

root
 |-- city: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- pin: long (nullable = true)



# Each na.fill() only touches columns whose type matches the replacement
# value: fill(-1) fills nulls in the numeric (long) columns id and pin,
# and fill("NotProvided") fills nulls in the string columns name and city.
# Chaining the two covers every column; mismatched columns are skipped.
df.na.fill(-1).na.fill("NotProvided").show()
# fill -1 for all numeric fields and NotProvided for all string fields
+-----------+---+-----------+------+
|       city| id|       name|   pin|
+-----------+---+-----------+------+
|     Nellai|100|       Sara|627001|
|      Kovai|102|NotProvided|    -1|
|NotProvided| -1|       Raji|    -1|
+-----------+---+-----------+------+

#with datetime-like string fields and None values
# Date/time-looking strings are NOT parsed automatically: without an
# explicit schema, 'doj' is inferred as a plain string column (see the
# printSchema output below).
from pyspark.sql import Row

staff = [
    Row(id=100, name='Sara', city='Nellai', pin=627001, doj='2014-12-23 23:34:45'),
    Row(id=102, name=None, city='Kovai', pin=None, doj=None),
    Row(id=None, name='Raji', city=None, pin=None, doj='2010-01-01 12:34:22'),
]
df = spark.createDataFrame(staff)
df.select("id", "name", 'doj', "city", "pin").show()

+----+----+-------------------+------+------+
|  id|name|                doj|  city|   pin|
+----+----+-------------------+------+------+
| 100|Sara|2014-12-23 23:34:45|Nellai|627001|
| 102|null|               null| Kovai|  null|
|null|Raji|2010-01-01 12:34:22|  null|  null|
+----+----+-------------------+------+------+

root
 |-- city: string (nullable = true)
 |-- doj: string (nullable = true)  -- note: doj is inferred as string, not timestamp
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- pin: long (nullable = true)

No comments:

Post a Comment

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...