Tuesday, 11 August 2020

Read and Write Parquet Files Using Spark


// Select specific columns only
scala> df.select("id","fname","lname","city").show(5)
+---+-------+---------+-----------+
| id|  fname|    lname|       city|
+---+-------+---------+-----------+
|  1|Richard|Hernandez|Brownsville|
|  2|   Mary|  Barrett|  Littleton|
|  3|    Ann|    Smith|     Caguas|
|  4|   Mary|    Jones| San Marcos|
|  5| Robert|   Hudson|     Caguas|
+---+-------+---------+-----------+
only showing top 5 rows
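
The same selection can also be written with column expressions instead of plain column names; a minimal sketch, assuming the same df and the spark-shell session above:

// Select with column expressions (equivalent to the string-based select)
scala> import org.apache.spark.sql.functions.col
scala> df.select(col("id"), $"fname", $"lname", $"city").show(5)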


// Write the DataFrame content out as a Parquet file
scala> df.select("id","fname","lname","city").write.format("parquet").save("/home/cloudera/sparkoutput/cust_parquet")
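
The write can be tuned further with an explicit save mode and compression codec; a minimal sketch against the same output path (overwrite mode replaces any existing output folder):

// Overwrite the output folder and set the compression codec explicitly
scala> df.select("id","fname","lname","city").write.mode("overwrite").option("compression","snappy").parquet("/home/cloudera/sparkoutput/cust_parquet")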


// Display the Parquet file content using parquet-tools
$ parquet-tools head  -n5 /home/cloudera/sparkoutput/cust_parquet/part-00000-acbfe0c1-b743-47ce-860f-044dd4dffe72-c000.snappy.parquet 
id = 1
fname = Richard
lname = Hernandez
city = Brownsville

id = 2
fname = Mary
lname = Barrett
city = Littleton

id = 3
fname = Ann
lname = Smith
city = Caguas

id = 4
fname = Mary
lname = Jones
city = San Marcos

id = 5
fname = Robert
lname = Hudson
city = Caguas

// Display the Parquet file in JSON format - first 10 records
$ parquet-tools cat --json /home/cloudera/sparkoutput/cust_parquet/part-00000-acbfe0c1-b743-47ce-860f-044dd4dffe72-c000.snappy.parquet | head
{"id":1,"fname":"Richard","lname":"Hernandez","city":"Brownsville"}
{"id":2,"fname":"Mary","lname":"Barrett","city":"Littleton"}
{"id":3,"fname":"Ann","lname":"Smith","city":"Caguas"}
{"id":4,"fname":"Mary","lname":"Jones","city":"San Marcos"}
{"id":5,"fname":"Robert","lname":"Hudson","city":"Caguas"}
{"id":6,"fname":"Mary","lname":"Smith","city":"Passaic"}
{"id":7,"fname":"Melissa","lname":"Wilcox","city":"Caguas"}
{"id":8,"fname":"Megan","lname":"Smith","city":"Lawrence"}
{"id":9,"fname":"Mary","lname":"Perez","city":"Caguas"}
{"id":10,"fname":"Melissa","lname":"Smith","city":"Stafford"}


// Read a Parquet file using Spark

scala> val dfParquet = spark.read.format("parquet").load("/home/cloudera/sparkoutput/cust_parquet")
dfParquet: org.apache.spark.sql.DataFrame = [id: bigint, fname: string ... 2 more fields]
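
The shorter parquet() reader is equivalent to format("parquet").load(...); a one-line sketch (dfParquet2 is just an illustrative name):

// Shorthand Parquet reader
scala> val dfParquet2 = spark.read.parquet("/home/cloudera/sparkoutput/cust_parquet")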

scala> dfParquet.show(3)
+---+-------+---------+-----------+
| id|  fname|    lname|       city|
+---+-------+---------+-----------+
|  1|Richard|Hernandez|Brownsville|
|  2|   Mary|  Barrett|  Littleton|
|  3|    Ann|    Smith|     Caguas|
+---+-------+---------+-----------+
only showing top 3 rows


scala> dfParquet.printSchema()
root
 |-- id: long (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- city: string (nullable = true)
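
Once loaded, the Parquet data can be queried like any other DataFrame; a small sketch that registers a temporary view and runs Spark SQL over it (the view name custView is just an illustration):

// Register a temp view and query the Parquet data with Spark SQL
scala> dfParquet.createOrReplaceTempView("custView")
scala> spark.sql("SELECT city, count(*) AS cnt FROM custView GROUP BY city ORDER BY cnt DESC").show(5)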
