Start pyspark with --packages option:
pyspark --packages com.databricks:spark-csv_2.11:1.5.0,com.databricks:spark-xml_2.11:0.5.0,com.databricks:spark-avro_2.11:4.0.0
df = spark.read.format("xml").option("rowTag","book").load("E:\\DataSets\\books.xml")
df.printSchema()
root
|-- _id: string (nullable = true)
|-- author: string (nullable = true)
|-- description: string (nullable = true)
|-- genre: string (nullable = true)
|-- price: double (nullable = true)
|-- publish_date: string (nullable = true)
|-- title: string (nullable = true)
df.show(5)
+-----+--------------------+--------------------+--------+-----+------------+--------------------+
| _id| author| description| genre|price|publish_date| title|
+-----+--------------------+--------------------+--------+-----+------------+--------------------+
|bk101|Gambardella, Matthew|An in-depth look ...|Computer|44.95| 2000-10-01|XML Developer's G...|
|bk102| Ralls, Kim|A former architec...| Fantasy| 5.95| 2000-12-16| Midnight Rain|
|bk103| Corets, Eva|After the collaps...| Fantasy| 5.95| 2000-11-17| Maeve Ascendant|
|bk104| Corets, Eva|In post-apocalyps...| Fantasy| 5.95| 2001-03-10| Oberon's Legacy|
|bk105| Corets, Eva|The two daughters...| Fantasy| 5.95| 2001-09-10| The Sundered Grail|
+-----+--------------------+--------------------+--------+-----+------------+--------------------+
only showing top 5 rows
df.filter("author='Corets, Eva' and genre='Fantasy' and publish_date='2001-09-10'").show()
+-----+-----------+--------------------+-------+-----+------------+------------------+
| _id| author| description| genre|price|publish_date| title|
+-----+-----------+--------------------+-------+-----+------------+------------------+
|bk105|Corets, Eva|The two daughters...|Fantasy| 5.95| 2001-09-10|The Sundered Grail|
+-----+-----------+--------------------+-------+-----+------------+------------------+
#Multilevel json - json with struct fields inside
df = spark.read.format("xml").option("rowTag","Movie").load("E:\\DataSets\\Movies.xml")
df.printSchema()
root
|-- Director: struct (nullable = true)
| |-- Name: struct (nullable = true)
| | |-- First: string (nullable = true)
| | |-- Last: string (nullable = true)
| | |-- _highratedmovie: string (nullable = true)
|-- Genre: string (nullable = true)
|-- Studio: string (nullable = true)
|-- Title: struct (nullable = true)
| |-- _VALUE: string (nullable = true)
| |-- _runtime: long (nullable = true)
|-- Year: double (nullable = true)
|-- _rating: string (nullable = true)
df.show(5)
+--------------------+-----------------+--------------------+--------------------+------+-------+
| Director| Genre| Studio| Title| Year|_rating|
+--------------------+-----------------+--------------------+--------------------+------+-------+
|[[Francis Ford , ...| Crime Drama | Paramount Pictures |[The Godfather, 177]|1972.0| R|
|[[ Frank , Dar...| Drama| Columbia Pictures |[The Shwashank Re...|1994.0| R|
|[[ Ramesh , Sip...| Action-Advneture|United Producers ...| [Sholay, 204]|1975.0| PG|
|[[ Ashutosh , G...| Drama|Ashutosh Gowarike...| [Swades,]|2004.0| G|
|[[ Vijay , Anan...| Romantic Drama| Navketan Films| [Guide, 183]|1965.0| G|
+--------------------+-----------------+--------------------+--------------------+------+-------+
only showing top 5 rows
df1 = df.select("Director.Name.First","Director.Name.Last","Director.Name._highratedmovie","Genre","Studio","Title._VALUE","Title._runtime", "Year","_rating" )
df1.show(2)
+-------------+---------+---------------+-------------+-------------------+--------------------+--------+------+-------+
| First| Last|_highratedmovie| Genre| Studio| _VALUE|_runtime| Year|_rating|
+-------------+---------+---------------+-------------+-------------------+--------------------+--------+------+-------+
|Francis Ford | Coppola | null| Crime Drama |Paramount Pictures | The Godfather| 177|1972.0| R|
| Frank | Darabont| The Mist| Drama| Columbia Pictures |The Shwashank Red...| 142|1994.0| R|
+-------------+---------+---------------+-------------+-------------------+--------------------+--------+------+-------+
only showing top 2 rows