// The spark-xml package must be added as a dependency before Spark can read or write XML
SBT:
// https://mvnrepository.com/artifact/com.databricks/spark-xml
libraryDependencies += "com.databricks" %% "spark-xml" % "0.9.0"
Maven:
<!-- https://mvnrepository.com/artifact/com.databricks/spark-xml -->
<dependency>
<groupId>com.databricks</groupId>
<artifactId>spark-xml_2.11</artifactId>
<version>0.9.0</version>
</dependency>
// Exit any running spark-shell session, then relaunch it with the spark-xml package
$ spark-shell --packages com.databricks:spark-xml_2.11:0.9.0
$ spark-shell --packages com.databricks:spark-xml_2.11:0.9.0
Ivy Default Cache set to: /home/cloudera/.ivy2/cache
The jars for the packages stored in: /home/cloudera/.ivy2/jars
:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.databricks#spark-xml_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7f58f8bd-e82f-4378-a4ae-ffe907f5bb29;1.0
confs: [default]
found com.databricks#spark-xml_2.11;0.9.0 in central
found commons-io#commons-io;2.6 in central
found org.glassfish.jaxb#txw2;2.3.2 in central
downloading https://repo1.maven.org/maven2/com/databricks/spark-xml_2.11/0.9.0/spark-xml_2.11-0.9.0.jar ...
[SUCCESSFUL ] com.databricks#spark-xml_2.11;0.9.0!spark-xml_2.11.jar (488ms)
downloading https://repo1.maven.org/maven2/commons-io/commons-io/2.6/commons-io-2.6.jar ...
[SUCCESSFUL ] commons-io#commons-io;2.6!commons-io.jar (216ms)
downloading https://repo1.maven.org/maven2/org/glassfish/jaxb/txw2/2.3.2/txw2-2.3.2.jar ...
[SUCCESSFUL ] org.glassfish.jaxb#txw2;2.3.2!txw2.jar (175ms)
:: resolution report :: resolve 10552ms :: artifacts dl 897ms
:: modules in use:
com.databricks#spark-xml_2.11;0.9.0 from central in [default]
commons-io#commons-io;2.6 from central in [default]
org.glassfish.jaxb#txw2;2.3.2 from central in [default]
---------------------------------------------------------------------
| | modules || artifacts |
| conf | number| search|dwnlded|evicted|| number|dwnlded|
---------------------------------------------------------------------
| default | 3 | 3 | 3 | 0 || 3 | 3 |
---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-7f58f8bd-e82f-4378-a4ae-ffe907f5bb29
confs: [default]
3 artifacts copied, 0 already retrieved (523kB/21ms)
scala> import com.databricks.spark.xml._
import com.databricks.spark.xml._
// load ohm.products into dataframe
scala> val df = spark.sql("select * from ohm.products")
df: org.apache.spark.sql.DataFrame = [product_id: int, product_category_id: int ... 4 more fields]
scala> df.count()
res0: Long = 1345
scala> df.printSchema()
root
|-- product_id: integer (nullable = true)
|-- product_category_id: integer (nullable = true)
|-- product_name: string (nullable = true)
|-- product_description: string (nullable = true)
|-- product_price: float (nullable = true)
|-- product_image: string (nullable = true)
// Write all the data in the DataFrame out as XML files
scala> df.write.format("xml").save("/home/cloudera/sparkoutput/prod_xml")
// display the xml content in Linux using cat
$ cat /home/cloudera/sparkoutput/prod_xml/* | tail
<ROW>
<product_id>1345</product_id>
<product_category_id>59</product_category_id>
<product_name>Nike Men's Home Game Jersey St. Louis Rams Gr</product_name>
<product_description></product_description>
<product_price>100.0</product_price>
<product_image>http://images.acmesports.sports/Nike+Men%27s+Home+Game+Jersey+St.+Louis+Rams+Greg+Robinson...</product_image>
</ROW>
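// Read the XML files back into a DataFrame
// (the schema is inferred on read: here the columns come back in alphabetical order, int becomes long and float becomes double)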
scala> val dfprodXml = spark.read.format("xml").load("/home/cloudera/sparkoutput/prod_xml")
dfprodXml: org.apache.spark.sql.DataFrame = [product_category_id: bigint, product_description: string ... 4 more fields]
scala> dfprodXml.printSchema()
root
|-- product_category_id: long (nullable = true)
|-- product_description: string (nullable = true)
|-- product_id: long (nullable = true)
|-- product_image: string (nullable = true)
|-- product_name: string (nullable = true)
|-- product_price: double (nullable = true)
scala> dfprodXml.select("product_name","product_price").show()
+--------------------+-------------+
| product_name|product_price|
+--------------------+-------------+
|Quest Q64 10 FT. ...| 59.98|
|Under Armour Men'...| 129.99|
|Under Armour Men'...| 89.99|
|Under Armour Men'...| 89.99|
|Riddell Youth Rev...| 199.99|
|Jordan Men's VI R...| 134.99|
|Schutt Youth Recr...| 99.99|
|Nike Men's Vapor ...| 129.99|
|Nike Adult Vapor ...| 50.0|
|Under Armour Men'...| 129.99|
|Fitness Gear 300 ...| 209.99|
|Under Armour Men'...| 139.99|
|Under Armour Men'...| 89.99|
|Quik Shade Summit...| 199.99|
|Under Armour Kids...| 59.99|
|Riddell Youth 360...| 299.99|
|Under Armour Men'...| 129.99|
|Reebok Men's Full...| 29.97|
|Nike Men's Finger...| 124.99|
|Under Armour Men'...| 129.99|
+--------------------+-------------+
only showing top 20 rows
No comments:
Post a Comment