// select specific columns only
scala> df.select("id","fname","lname","city").show(5)
+---+-------+---------+-----------+
| id| fname| lname| city|
+---+-------+---------+-----------+
| 1|Richard|Hernandez|Brownsville|
| 2| Mary| Barrett| Littleton|
| 3| Ann| Smith| Caguas|
| 4| Mary| Jones| San Marcos|
| 5| Robert| Hudson| Caguas|
+---+-------+---------+-----------+
only showing top 5 rows
// Create a parquet file with the dataframe content
scala> df.select("id","fname","lname","city").write.format("parquet").save("/home/cloudera/sparkoutput/cust_parquet")
// Display the parquet file content using parquet-tools
$ parquet-tools head -n5 /home/cloudera/sparkoutput/cust_parquet/part-00000-acbfe0c1-b743-47ce-860f-044dd4dffe72-c000.snappy.parquet
id = 1
fname = Richard
lname = Hernandez
city = Brownsville
id = 2
fname = Mary
lname = Barrett
city = Littleton
id = 3
fname = Ann
lname = Smith
city = Caguas
id = 4
fname = Mary
lname = Jones
city = San Marcos
id = 5
fname = Robert
lname = Hudson
city = Caguas
// Display the parquet file content in JSON format - first 10 lines
$ parquet-tools cat --json /home/cloudera/sparkoutput/cust_parquet/part-00000-acbfe0c1-b743-47ce-860f-044dd4dffe72-c000.snappy.parquet | head
{"id":1,"fname":"Richard","lname":"Hernandez","city":"Brownsville"}
{"id":2,"fname":"Mary","lname":"Barrett","city":"Littleton"}
{"id":3,"fname":"Ann","lname":"Smith","city":"Caguas"}
{"id":4,"fname":"Mary","lname":"Jones","city":"San Marcos"}
{"id":5,"fname":"Robert","lname":"Hudson","city":"Caguas"}
{"id":6,"fname":"Mary","lname":"Smith","city":"Passaic"}
{"id":7,"fname":"Melissa","lname":"Wilcox","city":"Caguas"}
{"id":8,"fname":"Megan","lname":"Smith","city":"Lawrence"}
{"id":9,"fname":"Mary","lname":"Perez","city":"Caguas"}
{"id":10,"fname":"Melissa","lname":"Smith","city":"Stafford"}
// Read a parquet file using Spark
scala> val dfParquet = spark.read.format("parquet").load("/home/cloudera/sparkoutput/cust_parquet")
dfParquet: org.apache.spark.sql.DataFrame = [id: bigint, fname: string ... 2 more fields]
scala> dfParquet.show(3)
+---+-------+---------+-----------+
| id| fname| lname| city|
+---+-------+---------+-----------+
| 1|Richard|Hernandez|Brownsville|
| 2| Mary| Barrett| Littleton|
| 3| Ann| Smith| Caguas|
+---+-------+---------+-----------+
only showing top 3 rows
scala> dfParquet.printSchema()
root
|-- id: long (nullable = true)
|-- fname: string (nullable = true)
|-- lname: string (nullable = true)
|-- city: string (nullable = true)
No comments:
Post a Comment