Sankara's Big Data Notes: solution

Showing posts with label solution. Show all posts

Saturday, 15 August 2020

FIFA 2019 DataSet Analytics with Spark and Scala

// FIFA 2019 Dataset Analytics with Spark and Scala

scala> val df = spark.read.format("csv").option("header","True").option("inferSchema","True").load("D:\\Ex\\FIFA\\data.csv")

df: org.apache.spark.sql.DataFrame = [ID: int, Name: int ... 86 more fields]

scala> df.printSchema()

root

|-- _c0: integer (nullable = true)

|-- ID: integer (nullable = true)

|-- Name: integer (nullable = true)

|-- Age: string (nullable = true)

|-- Photo: integer (nullable = true)

|-- Nationality: string (nullable = true)

|-- Flag: string (nullable = true)

|-- Overall: string (nullable = true)

|-- Potential: integer (nullable = true)

|-- Club: integer (nullable = true)

|-- Club Logo: string (nullable = true)

|-- Value: string (nullable = true)

|-- Wage: string (nullable = true)

|-- Special: string (nullable = true)

|-- Preferred Foot: integer (nullable = true)

|-- International Reputation: string (nullable = true)

|-- Weak Foot: integer (nullable = true)

|-- Skill Moves: integer (nullable = true)

|-- Work Rate: integer (nullable = true)

|-- Body Type: string (nullable = true)

|-- Real Face: string (nullable = true)

|-- Position: string (nullable = true)

|-- Jersey Number: string (nullable = true)

|-- Joined: integer (nullable = true)

|-- Loaned From: string (nullable = true)

|-- Contract Valid Until: string (nullable = true)

|-- Height: string (nullable = true)

|-- Weight: string (nullable = true)

|-- LS: string (nullable = true)

|-- ST: string (nullable = true)

|-- RS: string (nullable = true)

|-- LW: string (nullable = true)

|-- LF: string (nullable = true)

|-- CF: string (nullable = true)

|-- RF: string (nullable = true)

|-- RW: string (nullable = true)

|-- LAM: string (nullable = true)

|-- CAM: string (nullable = true)

|-- RAM: string (nullable = true)

|-- LM: string (nullable = true)

|-- LCM: string (nullable = true)

|-- CM: string (nullable = true)

|-- RCM: string (nullable = true)

|-- RM: string (nullable = true)

|-- LWB: string (nullable = true)

|-- LDM: string (nullable = true)

|-- CDM: string (nullable = true)

|-- RDM: string (nullable = true)

|-- RWB: string (nullable = true)

|-- LB: string (nullable = true)

|-- LCB: string (nullable = true)

|-- CB: string (nullable = true)

|-- RCB: string (nullable = true)

|-- RB: string (nullable = true)

|-- Crossing: string (nullable = true)

|-- Finishing: integer (nullable = true)

|-- HeadingAccuracy: integer (nullable = true)

|-- ShortPassing: integer (nullable = true)

|-- Volleys: integer (nullable = true)

|-- Dribbling: integer (nullable = true)

|-- Curve: integer (nullable = true)

|-- FKAccuracy: integer (nullable = true)

|-- LongPassing: integer (nullable = true)

|-- BallControl: integer (nullable = true)

|-- Acceleration: integer (nullable = true)

|-- SprintSpeed: integer (nullable = true)

|-- Agility: integer (nullable = true)

|-- Reactions: integer (nullable = true)

|-- Balance: integer (nullable = true)

|-- ShotPower: integer (nullable = true)

|-- Jumping: integer (nullable = true)

|-- Stamina: integer (nullable = true)

|-- Strength: integer (nullable = true)

|-- LongShots: integer (nullable = true)

|-- Aggression: integer (nullable = true)

|-- Interceptions: integer (nullable = true)

|-- Positioning: integer (nullable = true)

|-- Vision: integer (nullable = true)

|-- Penalties: integer (nullable = true)

|-- Composure: integer (nullable = true)

|-- Marking: integer (nullable = true)

|-- StandingTackle: integer (nullable = true)

|-- SlidingTackle: integer (nullable = true)

|-- GKDiving: integer (nullable = true)

|-- GKHandling: integer (nullable = true)

|-- GKKicking: integer (nullable = true)

|-- GKPositioning: integer (nullable = true)

|-- GKReflexes: integer (nullable = true)

|-- Release Clause: integer (nullable = true)

// Total Row Count

scala> df.count()

res1: Long = 18207

// Display all the column names

scala> df.columns.foreach(println)

_c0

Name

Age

Photo

Nationality

Flag

Overall

Potential

Club

Club Logo

Value

Wage

Special

Preferred Foot

International Reputation

Weak Foot

Skill Moves

Work Rate

Body Type

Real Face

Position

Jersey Number

Joined

Loaned From

Contract Valid Until

Height

Weight

LAM

CAM

RAM

LCM

RCM

LWB

LDM

CDM

RDM

RWB

LCB

RCB

Crossing

Finishing

HeadingAccuracy

ShortPassing

Volleys

Dribbling

Curve

FKAccuracy

LongPassing

BallControl

Acceleration

SprintSpeed

Agility

Reactions

Balance

ShotPower

Jumping

Stamina

Strength

LongShots

Aggression

Interceptions

Positioning

Vision

Penalties

Composure

Marking

StandingTackle

SlidingTackle

GKDiving

GKHandling

GKKicking

GKPositioning

GKReflexes

Release Clause

// Total number of columns

scala> df.columns.length

res4: Int = 88

scala> df.columns.size

res5: Int = 88

scala> df.describe("Value").show()

+-------+-----+

|summary|Value|

+-------+-----+

| count|18207|

| mean| null|

| stddev| null|

| min| ?0|

| max| ?9M|

+-------+-----+

// Display Name and Nationality

scala> df.select("Name","Nationality").show(25)

+-----------------+-----------+

| Name|Nationality|

+-----------------+-----------+

| L. Messi| Argentina|

|Cristiano Ronaldo| Portugal|

| Neymar Jr| Brazil|

| De Gea| Spain|

| K. De Bruyne| Belgium|

| E. Hazard| Belgium|

| L. Modri?| Croatia|

| L. Suárez| Uruguay|

| Sergio Ramos| Spain|

| J. Oblak| Slovenia|

| R. Lewandowski| Poland|

| T. Kroos| Germany|

| D. Godín| Uruguay|

| David Silva| Spain|

| N. Kanté| France|

| P. Dybala| Argentina|

| H. Kane| England|

| A. Griezmann| France|

| M. ter Stegen| Germany|

| T. Courtois| Belgium|

| Sergio Busquets| Spain|

| E. Cavani| Uruguay|

| M. Neuer| Germany|

| S. Agüero| Argentina|

| G. Chiellini| Italy|

+-----------------+-----------+

only showing top 25 rows

// Display the total players count for each Nationality

scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).show(2000)

+--------------------+------------+

| Nationality|Total Player|

+--------------------+-----------+

| Chad| 2|

| Russia| 79|

| Paraguay| 85|

| Senegal| 130|

| Sweden| 397|

| Guyana| 3|

| Philippines| 2|

| Eritrea| 2|

| Fiji| 1|

| Turkey| 303|

| Iraq| 7|

| Germany| 1198|

| St Kitts Nevis| 3|

| Comoros| 6|

| Afghanistan| 4|

| Ivory Coast| 100|

| Jordan| 1|

| Rwanda| 1|

| Sudan| 3|

| France| 914|

| Greece| 102|

| Kosovo| 33|

|Central African Rep.| 3|

| DR Congo| 52|

| Montserrat| 4|

| Algeria| 60|

| Togo| 12|

| Equatorial Guinea| 5|

| Slovakia| 54|

| Argentina| 937|

| Wales| 129|

| Belgium| 260|

| Angola| 15|

| St Lucia| 1|

| Ecuador| 43|

| Qatar| 1|

| Albania| 40|

| Madagascar| 12|

| Finland| 67|

| New Caledonia| 1|

| Ghana| 114|

| Nicaragua| 2|

| Peru| 37|

| Benin| 15|

| Sierra Leone| 6|

| United States| 353|

| Curacao| 14|

| India| 30|

| Belarus| 4|

| Kuwait| 1|

| Malta| 1|

| Chile| 391|

| Puerto Rico| 1|

| Croatia| 126|

| Burundi| 3|

| Nigeria| 121|

| Bolivia| 30|

| Andorra| 1|

| Gabon| 15|

| Korea Republic| 335|

| Italy| 702|

| Suriname| 4|

| Lithuania| 8|

| Norway| 341|

| Spain| 1072|

| Cuba| 4|

| Mauritania| 4|

| Denmark| 336|

| Niger| 3|

| Barbados| 3|

| Iran| 17|

| Congo| 25|

| Liechtenstein| 3|

| Thailand| 5|

| Morocco| 85|

| Cape Verde| 19|

| Panama| 15|

| Hong Kong| 2|

| Korea DPR| 4|

| Ukraine| 73|

| Venezuela| 67|

| Iceland| 47|

| Israel| 14|

| Bosnia Herzegovina| 61|

| Oman| 1|

| Cyprus| 8|

| Palestine| 1|

| Uruguay| 149|

| Mexico| 366|

| FYR Macedonia| 20|

| Montenegro| 23|

| Zimbabwe| 13|

| Estonia| 13|

| Georgia| 26|

| Indonesia| 1|

| Guatemala| 3|

| Guam| 1|

| Libya| 4|

| Azerbaijan| 5|

| Grenada| 1|

| Armenia| 10|

| Tunisia| 32|

| Liberia| 1|

| Syria| 9|

| Honduras| 16|

| Saudi Arabia| 340|

| Uganda| 6|

| Namibia| 3|

| Switzerland| 220|

| Zambia| 9|

| Ethiopia| 1|

| Jamaica| 32|

| Latvia| 6|

|United Arab Emirates| 1|

| South Sudan| 1|

| Guinea| 31|

| Canada| 64|

| Uzbekistan| 2|

| Faroe Islands| 6|

| Czech Republic| 100|

| Mozambique| 4|

| Brazil| 827|

| Belize| 1|

| Kenya| 10|

| Gambia| 15|

| Lebanon| 1|

| Slovenia| 55|

| Dominican Republic| 2|

| Japan| 478|

| Tanzania| 3|

| Botswana| 1|

| Republic of Ireland| 368|

| Luxembourg| 8|

| New Zealand| 44|

| England| 1662|

| Trinidad & Tobago| 4|

| China PR| 392|

| Haiti| 10|

| Poland| 350|

| Portugal| 322|

| Cameroon| 90|

| Australia| 236|

| Romania| 54|

| Bulgaria| 32|

| Austria| 298|

| Egypt| 31|

| Costa Rica| 30|

| El Salvador| 5|

| Kazakhstan| 4|

| Serbia| 126|

| Burkina Faso| 16|

| South Africa| 71|

| Bermuda| 2|

| Scotland| 286|

| Colombia| 618|

| Northern Ireland| 80|

| Hungary| 38|

| Guinea Bissau| 15|

| Antigua & Barbuda| 4|

| Mauritius| 1|

| Moldova| 5|

| Netherlands| 453|

| Mali| 43|

| S?o Tomé & Príncipe| 1|

+--------------------+-----------+

// Top 20 Nationalities in ASCENDING order

scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).orderBy("Nationality").show(

+------------------+------------+

| Nationality|Total Player|

+------------------+------------+

| Afghanistan| 4|

| Albania| 40|

| Algeria| 60|

| Andorra| 1|

| Angola| 15|

| Antigua & Barbuda| 4|

| Argentina| 937|

| Armenia| 10|

| Australia| 236|

| Austria| 298|

| Azerbaijan| 5|

| Barbados| 3|

| Belarus| 4|

| Belgium| 260|

| Belize| 1|

| Benin| 15|

| Bermuda| 2|

| Bolivia| 30|

|Bosnia Herzegovina| 61|

| Botswana| 1|

+------------------+------------+

// Nationality wise descending order

scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).orderBy(desc("Nationality")).show(20)

+--------------------+------------+

| Nationality|Total Player|

+--------------------+------------+

| Zimbabwe| 13|

| Zambia| 9|

| Wales| 129|

| Venezuela| 67|

| Uzbekistan| 2|

| Uruguay| 149|

| United States| 353|

|United Arab Emirates| 1|

| Ukraine| 73|

| Uganda| 6|

| Turkey| 303|

| Tunisia| 32|

| Trinidad & Tobago| 4|

| Togo| 12|

| Thailand| 5|

| Tanzania| 3|

| S?o Tomé & Príncipe| 1|

| Syria| 9|

| Switzerland| 220|

| Sweden| 397|

+--------------------+------------+

only showing top 20 rows

// Total Players for each Nationality in ASCending Order

scala> df.groupBy("Nationality").agg(count("Name").alias("TotalPlayer")).orderBy(desc("TotalPlayer")).show(20)

+-------------------+-----------+

| Nationality|TotalPlayer|

+-------------------+-----------+

| England| 1662|

| Germany| 1198|

| Spain| 1072|

| Argentina| 937|

| France| 914|

| Brazil| 827|

| Italy| 702|

| Colombia| 618|

| Japan| 478|

| Netherlands| 453|

| Sweden| 397|

| China PR| 392|

| Chile| 391|

|Republic of Ireland| 368|

| Mexico| 366|

| United States| 353|

| Poland| 350|

| Norway| 341|

| Saudi Arabia| 340|

| Denmark| 336|

+-------------------+-----------+

only showing top 20 rows

// Display Player Name, Nationality, Club informations

scala> df.select("Name","Nationality","Club").show()

+-----------------+-----------+-------------------+

| Name|Nationality| Club|

+-----------------+-----------+-------------------+

| L. Messi| Argentina| FC Barcelona|

|Cristiano Ronaldo| Portugal| Juventus|

| Neymar Jr| Brazil|Paris Saint-Germain|

| De Gea| Spain| Manchester United|

| K. De Bruyne| Belgium| Manchester City|

| E. Hazard| Belgium| Chelsea|

| L. Modri?| Croatia| Real Madrid|

| L. Suárez| Uruguay| FC Barcelona|

| Sergio Ramos| Spain| Real Madrid|

| J. Oblak| Slovenia| Atlético Madrid|

| R. Lewandowski| Poland| FC Bayern München|

| T. Kroos| Germany| Real Madrid|

| D. Godín| Uruguay| Atlético Madrid|

| David Silva| Spain| Manchester City|

| N. Kanté| France| Chelsea|

| P. Dybala| Argentina| Juventus|

| H. Kane| England| Tottenham Hotspur|

| A. Griezmann| France| Atlético Madrid|

| M. ter Stegen| Germany| FC Barcelona|

| T. Courtois| Belgium| Real Madrid|

+-----------------+-----------+-------------------+

only showing top 20 rows

// Age is Lesser than 30

scala> df.select("ID","Name","Age","Nationality","Potential","Jersey Number","Club").filter("Age < 30").show()

+------+---------------+---+-----------+---------+-------------+-------------------+

+------+---------------+---+-----------+---------+-------------+-------------------+

|190871| Neymar Jr| 26| Brazil| 93| 10|Paris Saint-Germain|

|193080| De Gea| 27| Spain| 93| 1| Manchester United|

|192985| K. De Bruyne| 27| Belgium| 92| 7| Manchester City|

|183277| E. Hazard| 27| Belgium| 91| 10| Chelsea|

|200389| J. Oblak| 25| Slovenia| 93| 1| Atlético Madrid|

|188545| R. Lewandowski| 29| Poland| 90| 9| FC Bayern München|

|182521| T. Kroos| 28| Germany| 90| 8| Real Madrid|

|215914| N. Kanté| 27| France| 90| 13| Chelsea|

|211110| P. Dybala| 24| Argentina| 94| 21| Juventus|

|202126| H. Kane| 24| England| 91| 9| Tottenham Hotspur|

|194765| A. Griezmann| 27| France| 90| 7| Atlético Madrid|

|192448| M. ter Stegen| 26| Germany| 92| 22| FC Barcelona|

|192119| T. Courtois| 26| Belgium| 90| 1| Real Madrid|

|189511|Sergio Busquets| 29| Spain| 89| 5| FC Barcelona|

|231747| K. Mbappé| 19| France| 95| 10|Paris Saint-Germain|

|209331| M. Salah| 26| Egypt| 89| 10| Liverpool|

|200145| Casemiro| 26| Brazil| 90| 14| Real Madrid|

|198710| J. Rodríguez| 26| Colombia| 89| 10| FC Bayern München|

|198219| L. Insigne| 27| Italy| 88| 10| Napoli|

|197781| Isco| 26| Spain| 91| 22| Real Madrid|

+------+---------------+---+-----------+---------+-------------+-------------------+

only showing top 20 rows

Thursday, 13 August 2020

Google Adsense Analytics using Spark with Scala

$ hdfs dfs -cat hdfs://localhost:8020/user/cloudera/adsense/adsense.csv | head

Date,Page views,Impressions,Clicks,Page RPM (USD),Impression RPM (USD),Active View Viewable,Estimated earnings (USD)

2010-02-25,61358,160401,525,1.04,0.4,,63.71

2011-07-03,60066,157661,774,0.62,0.24,,37.19

2011-04-03,59079,148407,339,0.5,0.2,,29.54

2011-07-04,55221,142543,802,0.88,0.34,,48.58

2011-11-21,50808,136381,786,1.04,0.39,,52.73

2011-11-28,48360,130869,996,1.12,0.41,,54.14

2011-07-08,49545,129778,806,0.94,0.36,,46.45

2011-11-20,49020,129486,622,0.88,0.33,,42.98

2010-12-18,45092,127493,289,1.19,0.42,,53.77

scala> val df = spark.read.format("csv").option("header","True").option("inferSchema","True").load("hdfs://localhost:8020/user/cloudera/adsense/adsense.csv")

scala> df.printSchema()

root

|-- Date: timestamp (nullable = true)

|-- Page views: integer (nullable = true)

|-- Impressions: integer (nullable = true)

|-- Clicks: integer (nullable = true)

|-- Page RPM (USD): double (nullable = true)

|-- Impression RPM (USD): double (nullable = true)

|-- Active View Viewable: string (nullable = true)

|-- Estimated earnings (USD): double (nullable = true)

val dfAdsense = df.withColumnRenamed("Date","fldDate").withColumnRenamed("Page views","fldPageViews").withColumnRenamed("Impressions","fldImpressions").withColumnRenamed("Clicks","fldClicks").withColumnRenamed("Page RPM (USD)","fldPageRPM").withColumnRenamed("Impression RPM (USD)","fldimpressionRPM").withColumnRenamed("Active View Viewable","fldActiveViewViewable").withColumnRenamed("Estimated earnings (USD)","fldEstimatedEarnings")

scala> dfAdsense.printSchema()

root

|-- fldDate: timestamp (nullable = true)

|-- fldPageViews: integer (nullable = true)

|-- fldImpressions: integer (nullable = true)

|-- fldClicks: integer (nullable = true)

|-- fldPageRPM: double (nullable = true)

|-- fldimpressionRPM: double (nullable = true)

|-- fldActiveViewViewable: string (nullable = true)

|-- fldEstimatedEarnings: double (nullable = true)

// total row count before remove 0 earnings records

scala> dfAdsense.count

res3: Long = 2452

scala> val df = dfAdsense.filter($"fldEstimatedEarnings" =!= 0)

df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [fldDate: timestamp, fldPageViews: int ... 6 more fields]

scala> print(df.count())

1511

scala> df.agg(sum("fldEstimatedEarnings")).show()

+-------------------------+

|sum(fldEstimatedEarnings)|

+-------------------------+

| 16995.739999999976|

scala> df.select("fldDate","fldEstimatedEarnings").sort("fldEstimatedEarnings").limit(10).show()

+-------------------+--------------------+

| fldDate|fldEstimatedEarnings|

+-------------------+--------------------+

|2008-09-08 00:00:00| 0.01|

|2016-02-22 00:00:00| 0.01|

|2008-09-03 00:00:00| 0.01|

|2009-03-07 00:00:00| 0.01|

|2008-09-05 00:00:00| 0.01|

|2011-12-10 00:00:00| 0.01|

|2011-12-06 00:00:00| 0.01|

|2011-12-13 00:00:00| 0.01|

|2008-09-14 00:00:00| 0.01|

|2008-08-30 00:00:00| 0.01|

+-------------------+--------------------+

scala> df.select("fldDate","fldEstimatedEarnings").sort(desc("fldEstimatedEarnings")).limit(10).show()

+-------------------+--------------------+

| fldDate|fldEstimatedEarnings|

+-------------------+--------------------+

|2010-02-25 00:00:00| 63.71|

|2011-11-28 00:00:00| 54.14|

|2010-12-18 00:00:00| 53.77|

|2011-11-21 00:00:00| 52.73|

|2010-12-19 00:00:00| 52.14|

|2011-07-13 00:00:00| 50.86|

|2011-12-02 00:00:00| 49.63|

|2011-06-24 00:00:00| 48.91|

|2011-09-27 00:00:00| 48.8|

|2011-07-04 00:00:00| 48.58|

+-------------------+--------------------+

scala> df.select("fldDate","fldEstimatedEarnings", "fldClicks","fldPageViews").sort(desc("fldEstimatedEarnings")).limit(10).show()

+-------------------+--------------------+---------+------------+

+-------------------+--------------------+---------+------------+

|2010-02-25 00:00:00| 63.71| 525| 61358|

|2011-11-28 00:00:00| 54.14| 996| 48360|

|2010-12-18 00:00:00| 53.77| 289| 45092|

|2011-11-21 00:00:00| 52.73| 786| 50808|

|2010-12-19 00:00:00| 52.14| 202| 35461|

|2011-07-13 00:00:00| 50.86| 751| 40785|

|2011-12-02 00:00:00| 49.63| 842| 42972|

|2011-06-24 00:00:00| 48.91| 704| 33192|

|2011-09-27 00:00:00| 48.8| 680| 34525|

|2011-07-04 00:00:00| 48.58| 802| 55221|

+-------------------+--------------------+---------+------------+

scala> df.groupBy(year($"fldDate")).agg(sum($"fldEstimatedEarnings")).show()

+-------------+-------------------------+

|year(fldDate)|sum(fldEstimatedEarnings)|

+-------------+-------------------------+

| 2015| 0.21000000000000002|

| 2013| 1.2799999999999998|

| 2014| 0.03|

| 2012| 2702.84|

| 2009| 1640.6799999999964|

| 2016| 0.25|

| 2010| 2883.2799999999975|

| 2011| 9392.38|

| 2008| 374.7300000000001|

| 2017| 0.060000000000000005|

+-------------+-------------------------+

scala> df.groupBy(year($"fldDate")).agg(round(sum($"fldEstimatedEarnings"),2)).show()

+-------------+-----------------------------------+

|year(fldDate)|round(sum(fldEstimatedEarnings), 2)|

+-------------+-----------------------------------+

| 2015| 0.21|

| 2013| 1.28|

| 2014| 0.03|

| 2012| 2702.84|

| 2009| 1640.68|

| 2016| 0.25|

| 2010| 2883.28|

| 2011| 9392.38|

| 2008| 374.73|

| 2017| 0.06|

+-------------+-----------------------------------+

scala> df.groupBy(year($"fldDate").alias("Year")).agg(round(sum($"fldEstimatedEarnings"),2).alias("Earnings")).show()

+----+--------+

|Year|Earnings|

+----+--------+

|2015| 0.21|

|2013| 1.28|

|2014| 0.03|

|2012| 2702.84|

|2009| 1640.68|

|2016| 0.25|

|2010| 2883.28|

|2011| 9392.38|

|2008| 374.73|

|2017| 0.06|

+----+--------+

scala> df.groupBy(year($"fldDate").alias("Year"), month($"fldDate").alias("Month")).agg(round(sum($"fldEstimatedEarnings"),2).alias("Earnings")).sort("Year","Month").show(200)

+----+-----+--------+

|Year|Month|Earnings|

+----+-----+--------+

|2008| 6| 35.76|

|2008| 7| 82.05|

|2008| 8| 27.17|

|2008| 9| 21.12|

|2008| 10| 65.87|

|2008| 11| 78.38|

|2008| 12| 64.38|

|2009| 1| 130.1|

|2009| 2| 26.69|

|2009| 3| 59.68|

|2009| 4| 94.23|

|2009| 5| 186.15|

|2009| 6| 248.88|

|2009| 7| 190.95|

|2009| 8| 81.7|

|2009| 9| 60.99|

|2009| 10| 206.2|

|2009| 11| 198.68|

|2009| 12| 156.43|

|2010| 1| 101.46|

|2010| 2| 316.61|

|2010| 3| 179.72|

|2010| 4| 142.26|

|2010| 5| 120.05|

|2010| 6| 132.18|

|2010| 7| 134.75|

|2010| 8| 137.77|

|2010| 9| 265.05|

|2010| 10| 267.31|

|2010| 11| 372.2|

|2010| 12| 713.92|

|2011| 1| 478.87|

|2011| 2| 551.54|

|2011| 3| 549.9|

|2011| 4| 683.07|

|2011| 5| 620.94|

|2011| 6| 969.76|

|2011| 7| 1149.31|

|2011| 8| 1057.64|

|2011| 9| 1039.34|

|2011| 10| 966.37|

|2011| 11| 1138.69|

|2011| 12| 186.95|

|2012| 1| 0.48|

|2012| 2| 256.25|

|2012| 3| 564.13|

|2012| 4| 524.13|

|2012| 5| 504.83|

|2012| 6| 370.12|

|2012| 7| 261.1|

|2012| 8| 219.39|

|2012| 9| 2.38|

|2012| 10| 0.02|

|2012| 12| 0.01|

|2013| 1| 0.01|

|2013| 3| 0.25|

|2013| 4| 0.11|

|2013| 5| 0.09|

|2013| 6| 0.25|

|2013| 7| 0.03|

|2013| 9| 0.09|

|2013| 10| 0.32|

|2013| 12| 0.13|

|2014| 2| 0.03|

|2015| 2| 0.08|

|2015| 4| 0.03|

|2015| 7| 0.1|

|2016| 2| 0.01|

|2016| 3| 0.2|

|2016| 9| 0.03|

|2016| 11| 0.01|

|2017| 6| 0.06|

+----+-----+--------+

// order by Earnings group by Year and Month

scala> df.groupBy(year($"fldDate").alias("Year"), month($"fldDate").alias("Month")).agg(round(sum($"fldEstimatedEarnings"),2).alias("Earnings")).sort(desc("Earnings")).show(200)

+----+-----+--------+

|Year|Month|Earnings|

+----+-----+--------+

|2011| 7| 1149.31|

|2011| 11| 1138.69|

|2011| 8| 1057.64|

|2011| 9| 1039.34|

|2011| 6| 969.76|

|2011| 10| 966.37|

|2010| 12| 713.92|

|2011| 4| 683.07|

|2011| 5| 620.94|

|2012| 3| 564.13|

|2011| 2| 551.54|

|2011| 3| 549.9|

|2012| 4| 524.13|

|2012| 5| 504.83|

|2011| 1| 478.87|

|2010| 11| 372.2|

|2012| 6| 370.12|

|2010| 2| 316.61|

|2010| 10| 267.31|

|2010| 9| 265.05|

|2012| 7| 261.1|

|2012| 2| 256.25|

|2009| 6| 248.88|

|2012| 8| 219.39|

|2009| 10| 206.2|

|2009| 11| 198.68|

|2009| 7| 190.95|

|2011| 12| 186.95|

|2009| 5| 186.15|

|2010| 3| 179.72|

|2009| 12| 156.43|

|2010| 4| 142.26|

|2010| 8| 137.77|

|2010| 7| 134.75|

|2010| 6| 132.18|

|2009| 1| 130.1|

|2010| 5| 120.05|

|2010| 1| 101.46|

|2009| 4| 94.23|

|2008| 7| 82.05|

|2009| 8| 81.7|

|2008| 11| 78.38|

|2008| 10| 65.87|

|2008| 12| 64.38|

|2009| 9| 60.99|

|2009| 3| 59.68|

|2008| 6| 35.76|

|2008| 8| 27.17|

|2009| 2| 26.69|

|2008| 9| 21.12|

|2012| 9| 2.38|

|2012| 1| 0.48|

|2013| 10| 0.32|

|2013| 3| 0.25|

|2013| 6| 0.25|

|2016| 3| 0.2|

|2013| 12| 0.13|

|2013| 4| 0.11|

|2015| 7| 0.1|

|2013| 5| 0.09|

|2013| 9| 0.09|

|2015| 2| 0.08|

|2017| 6| 0.06|

|2013| 7| 0.03|

|2014| 2| 0.03|

|2015| 4| 0.03|

|2016| 9| 0.03|

|2012| 10| 0.02|

|2016| 11| 0.01|

|2012| 12| 0.01|

|2016| 2| 0.01|

|2013| 1| 0.01|

+----+-----+--------+

Monday, 3 August 2020

SQOOP Import Problem and Solution #3

#Problem : 2

Connect to MySQL database using SQOOP,

import all customers whose street name contains "Plaza"

Ex:

Hazy Mountain Plaza

Tawny Fox Plaza

Data description:

A MySQL instance is running on the localhost node.

In that instance, you will find customers table that contains

customer's data.

Installation : localhost

Database name : retail_db

Table Name : Customers

User Name : root

Password : cloudera

Output requirement:

Place the customers files in HDFS directory

"user/cloudera/problem1/customers/txtdata"

Save output in text format with fields

separated by a "*" and lines should be terminated by pipe.

Select the columns : customer_id,

customer_fname,customer_lname,

customer_street_name.

$ sqoop import \

-connect "jdbc:mysql://localhost/retail_db" \

-username root \

-password cloudera \

-table customers \

-where "customer_street like '%Plaza%'" \

-target-dir /user/cloudera/problem1/customers/textdata \

-columns "customer_id,customer_fname,customer_lname,customer_street"

dfs dfs -ls /user/cloudera/problem1/customers/textdata

Found 5 items

-rw-r--r-- 1 cloudera cloudera 0 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/_SUCCESS

-rw-r--r-- 1 cloudera cloudera 974 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00000

-rw-r--r-- 1 cloudera cloudera 1165 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00001

-rw-r--r-- 1 cloudera cloudera 986 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00002

-rw-r--r-- 1 cloudera cloudera 1028 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00003

$ hdfs dfs -cat /user/cloudera/problem1/customers/textdata/part-m-00000 | tail

1793,Mary,Burke,141 Dewy Plaza

1821,Mary,Trevino,6520 Quaking Quail Plaza

1891,Mary,Mora,1804 Misty Plaza

2053,Ronald,Smith,1025 Colonial Plaza

2247,Alice,Werner,1557 Heather Leaf Plaza

2517,Jose,Kane,5202 Emerald Island Plaza

2545,Larry,Patel,1688 Middle Panda Plaza

2575,Mary,Carroll,4979 Honey Plaza

2831,Mary,Padilla,3810 Crystal Plaza

2875,Mary,Roberts,2205 Velvet Plaza

SQOOP Import Problem and Solution #2

#Problem : 2

Connect to MySQL database using SQOOP,

import all customers lives in 'CA' state.

Data description:

A MySQL instance is running on the localhost node.

In that instance, you will find customers table that contains

customer's data.

Installation : localhost

Database name : retail_db

Table Name : Customers

User Name : root

Password : cloudera

Output requirement:

Place the customers files in HDFS directory

"user/cloudera/problem1/customers_selected/avrodata"

Use avro format with Snappy compression

Select the columns : customer_id,customer_fname,customer_lname,customer_state.

$ sqoop import \

-connect "jdbc:mysql://localhost/retail_db" \

-username root \

-password cloudera \

-table customers \

-where "customer_state='CA'" \

-target-dir /user/cloudera/problem1/customers_selected/avrodata \

-columns "customer_id,customer_fname,customer_lname,customer_state" \

-compress \

-compression-codec snappy \

-as-avrodatafile

$ hdfs dfs -ls /user/cloudera/problem1/customers_selected/avrodata

Found 5 items

-rw-r--r-- 1 cloudera cloudera 0 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/_SUCCESS

-rw-r--r-- 1 cloudera cloudera 6508 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00000.avro

-rw-r--r-- 1 cloudera cloudera 6427 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00001.avro

-rw-r--r-- 1 cloudera cloudera 6419 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00002.avro

-rw-r--r-- 1 cloudera cloudera 7093 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00003.avro

$ avro-tools tojson hdfs://localhost/user/cloudera/problem1/customers_selected/avrodata/part-m-00003.avro | head

log4j:WARN No appenders could be found for logger (org.apache.hadoop.metrics2.lib.MutableMetricsFactory).

log4j:WARN Please initialize the log4j system properly.

log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.

{"customer_id":{"int":9328},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Perez"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9333},"customer_fname":{"string":"Angela"},"customer_lname":{"string":"Mills"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9336},"customer_fname":{"string":"Janice"},"customer_lname":{"string":"Guzman"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9338},"customer_fname":{"string":"James"},"customer_lname":{"string":"Davis"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9340},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Smith"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9347},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Fuentes"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9352},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Lewis"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9357},"customer_fname":{"string":"Katherine"},"customer_lname":{"string":"Spence"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9363},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Simmons"},"customer_state":{"string":"CA"}}

{"customer_id":{"int":9375},"customer_fname":{"string":"George"},"customer_lname":{"string":"Smith"},"customer_state":{"string":"CA"}}

SQOOP Import problem and solution #1

SQOOP problem : #1

Connect to MySQL database using SQOOP, import all orders

from orders table whose order_status is COMPLETE.

Data Description:

A MySQL instance is running on the localhost.In that instance, you will find orders table that

contains order's data.

Installation : LocalHost

Database Name : retail_db

Table Name : Orders

User Name : root

Password : cloudera

Output requirement:

Place the order's files in HDFS directory:

/user/cloudera/problem1/orders/parquetdata

Use parquet format with tab delimiter and snappy compression/

NULL values are represented as -1 for numbers and "NA" for strings.

Solution:

$ sqoop import \

-connect "jdbc:mysql://localhost/retail_db" \

-username root \

-password cloudera \

-table orders \

-target-dir /user/cloudera/problem1/orders/parquetdata \

-delete-target-dir \

-fields-terminated-by "\t" \

-where "order_status='COMPLETE'" \

-null-string "NA" \

-null-non-string -1 \

-compress \

-compression-codec snappy \

-as-parquetfile

$ hdfs dfs -ls /user/cloudera/problem1/orders/parquetdataFound 6 items

drwxr-xr-x - cloudera cloudera 0 2020-08-03 08:09 /user/cloudera/problem1/orders/parquetdata/.metadata

drwxr-xr-x - cloudera cloudera 0 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/.signals

-rw-r--r-- 1 cloudera cloudera 50233 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/1fa590f9-281d-4d62-9650-ddfe04d7eae2.parquet

-rw-r--r-- 1 cloudera cloudera 47508 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/217e22ee-25b2-41a6-8bf9-f6fee4c7d0c4.parquet

-rw-r--r-- 1 cloudera cloudera 46231 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/826a8254-9700-4edd-afdb-b8fb58b2fd49.parquet

-rw-r--r-- 1 cloudera cloudera 46406 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/84355f27-d1a4-4c1b-ac9d-84fd5139a912.parquet

// View the parquet file content using parquet-tools:

[cloudera@quickstart ~]$ parquet-tools head -n5 hdfs://localhost/user/cloudera/problem1/orders/parquetdata/1fa590f9-281d-4d62-9650-ddfe04d7eae2.parquet

order_id = 51665

order_date = 1402729200000

order_customer_id = 8645

order_status = COMPLETE

order_id = 51670

order_date = 1402729200000

order_customer_id = 9322

order_status = COMPLETE

order_id = 51671

order_date = 1402729200000

order_customer_id = 9000

order_status = COMPLETE

order_id = 51673

order_date = 1402729200000

order_customer_id = 7538

order_status = COMPLETE

order_id = 51677

order_date = 1402729200000

order_customer_id = 6321

order_status = COMPLETE

Saturday, 15 August 2020

FIFA 2019 DataSet Analytics with Spark and Scala

Thursday, 13 August 2020

Google Adsense Analytics using Spark with Scala

Monday, 3 August 2020

SQOOP Import Problem and Solution #3

SQOOP Import Problem and Solution #2

SQOOP Import problem and solution #1

Flume - Simple Demo