Showing posts with label solution. Show all posts
Showing posts with label solution. Show all posts

Saturday, 15 August 2020

FIFA 2019 DataSet Analytics with Spark and Scala

// FIFA 2019 Dataset Analytics with Spark and Scala

scala> val df = spark.read.format("csv").option("header","True").option("inferSchema","True").load("D:\\Ex\\FIFA\\data.csv")
df: org.apache.spark.sql.DataFrame = [ID: int, Name: int ... 86 more fields]

scala> df.printSchema()
root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: integer (nullable = true)
 |-- Age: string (nullable = true)
 |-- Photo: integer (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: string (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: integer (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: string (nullable = true)
 |-- Preferred Foot: integer (nullable = true)
 |-- International Reputation: string (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: integer (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: string (nullable = true)
 |-- Joined: integer (nullable = true)
 |-- Loaned From: string (nullable = true)
 |-- Contract Valid Until: string (nullable = true)
 |-- Height: string (nullable = true)
 |-- Weight: string (nullable = true)
 |-- LS: string (nullable = true)
 |-- ST: string (nullable = true)
 |-- RS: string (nullable = true)
 |-- LW: string (nullable = true)
 |-- LF: string (nullable = true)
 |-- CF: string (nullable = true)
 |-- RF: string (nullable = true)
 |-- RW: string (nullable = true)
 |-- LAM: string (nullable = true)
 |-- CAM: string (nullable = true)
 |-- RAM: string (nullable = true)
 |-- LM: string (nullable = true)
 |-- LCM: string (nullable = true)
 |-- CM: string (nullable = true)
 |-- RCM: string (nullable = true)
 |-- RM: string (nullable = true)
 |-- LWB: string (nullable = true)
 |-- LDM: string (nullable = true)
 |-- CDM: string (nullable = true)
 |-- RDM: string (nullable = true)
 |-- RWB: string (nullable = true)
 |-- LB: string (nullable = true)
 |-- LCB: string (nullable = true)
 |-- CB: string (nullable = true)
 |-- RCB: string (nullable = true)
 |-- RB: string (nullable = true)
 |-- Crossing: string (nullable = true)
 |-- Finishing: integer (nullable = true)
 |-- HeadingAccuracy: integer (nullable = true)
 |-- ShortPassing: integer (nullable = true)
 |-- Volleys: integer (nullable = true)
 |-- Dribbling: integer (nullable = true)
 |-- Curve: integer (nullable = true)
 |-- FKAccuracy: integer (nullable = true)
 |-- LongPassing: integer (nullable = true)
 |-- BallControl: integer (nullable = true)
 |-- Acceleration: integer (nullable = true)
 |-- SprintSpeed: integer (nullable = true)
 |-- Agility: integer (nullable = true)
 |-- Reactions: integer (nullable = true)
 |-- Balance: integer (nullable = true)
 |-- ShotPower: integer (nullable = true)
 |-- Jumping: integer (nullable = true)
 |-- Stamina: integer (nullable = true)
 |-- Strength: integer (nullable = true)
 |-- LongShots: integer (nullable = true)
 |-- Aggression: integer (nullable = true)
 |-- Interceptions: integer (nullable = true)
 |-- Positioning: integer (nullable = true)
 |-- Vision: integer (nullable = true)
 |-- Penalties: integer (nullable = true)
 |-- Composure: integer (nullable = true)
 |-- Marking: integer (nullable = true)
 |-- StandingTackle: integer (nullable = true)
 |-- SlidingTackle: integer (nullable = true)
 |-- GKDiving: integer (nullable = true)
 |-- GKHandling: integer (nullable = true)
 |-- GKKicking: integer (nullable = true)
 |-- GKPositioning: integer (nullable = true)
 |-- GKReflexes: integer (nullable = true)
 |-- Release Clause: integer (nullable = true)
 
// Total Row Count
scala> df.count()
res1: Long = 18207


// Display all the column names 
scala> df.columns.foreach(println)
_c0
ID
Name
Age
Photo
Nationality
Flag
Overall
Potential
Club
Club Logo
Value
Wage
Special
Preferred Foot
International Reputation
Weak Foot
Skill Moves
Work Rate
Body Type
Real Face
Position
Jersey Number
Joined
Loaned From
Contract Valid Until
Height
Weight
LS
ST
RS
LW
LF
CF
RF
RW
LAM
CAM
RAM
LM
LCM
CM
RCM
RM
LWB
LDM
CDM
RDM
RWB
LB
LCB
CB
RCB
RB
Crossing
Finishing
HeadingAccuracy
ShortPassing
Volleys
Dribbling
Curve
FKAccuracy
LongPassing
BallControl
Acceleration
SprintSpeed
Agility
Reactions
Balance
ShotPower
Jumping
Stamina
Strength
LongShots
Aggression
Interceptions
Positioning
Vision
Penalties
Composure
Marking
StandingTackle
SlidingTackle
GKDiving
GKHandling
GKKicking
GKPositioning
GKReflexes
Release Clause


// Total number of columns
scala> df.columns.length
res4: Int = 88

scala> df.columns.size
res5: Int = 88



scala> df.describe("Value").show()
+-------+-----+
|summary|Value|
+-------+-----+
|  count|18207|
|   mean| null|
| stddev| null|
|    min|   ?0|
|    max|  ?9M|
+-------+-----+

// Display Name and Nationality
scala> df.select("Name","Nationality").show(25)
+-----------------+-----------+
|             Name|Nationality|
+-----------------+-----------+
|         L. Messi|  Argentina|
|Cristiano Ronaldo|   Portugal|
|        Neymar Jr|     Brazil|
|           De Gea|      Spain|
|     K. De Bruyne|    Belgium|
|        E. Hazard|    Belgium|
|        L. Modri?|    Croatia|
|        L. Suárez|    Uruguay|
|     Sergio Ramos|      Spain|
|         J. Oblak|   Slovenia|
|   R. Lewandowski|     Poland|
|         T. Kroos|    Germany|
|         D. Godín|    Uruguay|
|      David Silva|      Spain|
|         N. Kanté|     France|
|        P. Dybala|  Argentina|
|          H. Kane|    England|
|     A. Griezmann|     France|
|    M. ter Stegen|    Germany|
|      T. Courtois|    Belgium|
|  Sergio Busquets|      Spain|
|        E. Cavani|    Uruguay|
|         M. Neuer|    Germany|
|        S. Agüero|  Argentina|
|     G. Chiellini|      Italy|
+-----------------+-----------+
only showing top 25 rows


// Display the total players count for each Nationality 
scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).show(2000)
+--------------------+------------+
|         Nationality|Total Player|
+--------------------+-----------+
|                Chad|          2|
|              Russia|         79|
|            Paraguay|         85|
|             Senegal|        130|
|              Sweden|        397|
|              Guyana|          3|
|         Philippines|          2|
|             Eritrea|          2|
|                Fiji|          1|
|              Turkey|        303|
|                Iraq|          7|
|             Germany|       1198|
|      St Kitts Nevis|          3|
|             Comoros|          6|
|         Afghanistan|          4|
|         Ivory Coast|        100|
|              Jordan|          1|
|              Rwanda|          1|
|               Sudan|          3|
|              France|        914|
|              Greece|        102|
|              Kosovo|         33|
|Central African Rep.|          3|
|            DR Congo|         52|
|          Montserrat|          4|
|             Algeria|         60|
|                Togo|         12|
|   Equatorial Guinea|          5|
|            Slovakia|         54|
|           Argentina|        937|
|               Wales|        129|
|             Belgium|        260|
|              Angola|         15|
|            St Lucia|          1|
|             Ecuador|         43|
|               Qatar|          1|
|             Albania|         40|
|          Madagascar|         12|
|             Finland|         67|
|       New Caledonia|          1|
|               Ghana|        114|
|           Nicaragua|          2|
|                Peru|         37|
|               Benin|         15|
|        Sierra Leone|          6|
|       United States|        353|
|             Curacao|         14|
|               India|         30|
|             Belarus|          4|
|              Kuwait|          1|
|               Malta|          1|
|               Chile|        391|
|         Puerto Rico|          1|
|             Croatia|        126|
|             Burundi|          3|
|             Nigeria|        121|
|             Bolivia|         30|
|             Andorra|          1|
|               Gabon|         15|
|      Korea Republic|        335|
|               Italy|        702|
|            Suriname|          4|
|           Lithuania|          8|
|              Norway|        341|
|               Spain|       1072|
|                Cuba|          4|
|          Mauritania|          4|
|             Denmark|        336|
|               Niger|          3|
|            Barbados|          3|
|                Iran|         17|
|               Congo|         25|
|       Liechtenstein|          3|
|            Thailand|          5|
|             Morocco|         85|
|          Cape Verde|         19|
|              Panama|         15|
|           Hong Kong|          2|
|           Korea DPR|          4|
|             Ukraine|         73|
|           Venezuela|         67|
|             Iceland|         47|
|              Israel|         14|
|  Bosnia Herzegovina|         61|
|                Oman|          1|
|              Cyprus|          8|
|           Palestine|          1|
|             Uruguay|        149|
|              Mexico|        366|
|       FYR Macedonia|         20|
|          Montenegro|         23|
|            Zimbabwe|         13|
|             Estonia|         13|
|             Georgia|         26|
|           Indonesia|          1|
|           Guatemala|          3|
|                Guam|          1|
|               Libya|          4|
|          Azerbaijan|          5|
|             Grenada|          1|
|             Armenia|         10|
|             Tunisia|         32|
|             Liberia|          1|
|               Syria|          9|
|            Honduras|         16|
|        Saudi Arabia|        340|
|              Uganda|          6|
|             Namibia|          3|
|         Switzerland|        220|
|              Zambia|          9|
|            Ethiopia|          1|
|             Jamaica|         32|
|              Latvia|          6|
|United Arab Emirates|          1|
|         South Sudan|          1|
|              Guinea|         31|
|              Canada|         64|
|          Uzbekistan|          2|
|       Faroe Islands|          6|
|      Czech Republic|        100|
|          Mozambique|          4|
|              Brazil|        827|
|              Belize|          1|
|               Kenya|         10|
|              Gambia|         15|
|             Lebanon|          1|
|            Slovenia|         55|
|  Dominican Republic|          2|
|               Japan|        478|
|            Tanzania|          3|
|            Botswana|          1|
| Republic of Ireland|        368|
|          Luxembourg|          8|
|         New Zealand|         44|
|             England|       1662|
|   Trinidad & Tobago|          4|
|            China PR|        392|
|               Haiti|         10|
|              Poland|        350|
|            Portugal|        322|
|            Cameroon|         90|
|           Australia|        236|
|             Romania|         54|
|            Bulgaria|         32|
|             Austria|        298|
|               Egypt|         31|
|          Costa Rica|         30|
|         El Salvador|          5|
|          Kazakhstan|          4|
|              Serbia|        126|
|        Burkina Faso|         16|
|        South Africa|         71|
|             Bermuda|          2|
|            Scotland|        286|
|            Colombia|        618|
|    Northern Ireland|         80|
|             Hungary|         38|
|       Guinea Bissau|         15|
|   Antigua & Barbuda|          4|
|           Mauritius|          1|
|             Moldova|          5|
|         Netherlands|        453|
|                Mali|         43|
| S?o Tomé & Príncipe|          1|
+--------------------+-----------+


// Top 20 Nationalities in ASCENDING order 
scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).orderBy("Nationality").show(
+------------------+------------+
|       Nationality|Total Player|
+------------------+------------+
|       Afghanistan|           4|
|           Albania|          40|
|           Algeria|          60|
|           Andorra|           1|
|            Angola|          15|
| Antigua & Barbuda|           4|
|         Argentina|         937|
|           Armenia|          10|
|         Australia|         236|
|           Austria|         298|
|        Azerbaijan|           5|
|          Barbados|           3|
|           Belarus|           4|
|           Belgium|         260|
|            Belize|           1|
|             Benin|          15|
|           Bermuda|           2|
|           Bolivia|          30|
|Bosnia Herzegovina|          61|
|          Botswana|           1|
+------------------+------------+


// Nationality wise descending order 
scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).orderBy(desc("Nationality")).show(20)
+--------------------+------------+
|         Nationality|Total Player|
+--------------------+------------+
|            Zimbabwe|          13|
|              Zambia|           9|
|               Wales|         129|
|           Venezuela|          67|
|          Uzbekistan|           2|
|             Uruguay|         149|
|       United States|         353|
|United Arab Emirates|           1|
|             Ukraine|          73|
|              Uganda|           6|
|              Turkey|         303|
|             Tunisia|          32|
|   Trinidad & Tobago|           4|
|                Togo|          12|
|            Thailand|           5|
|            Tanzania|           3|
| S?o Tomé & Príncipe|           1|
|               Syria|           9|
|         Switzerland|         220|
|              Sweden|         397|
+--------------------+------------+
only showing top 20 rows

// Total Players for each Nationality in ASCending Order
scala> df.groupBy("Nationality").agg(count("Name").alias("TotalPlayer")).orderBy(desc("TotalPlayer")).show(20)
+-------------------+-----------+
|        Nationality|TotalPlayer|
+-------------------+-----------+
|            England|       1662|
|            Germany|       1198|
|              Spain|       1072|
|          Argentina|        937|
|             France|        914|
|             Brazil|        827|
|              Italy|        702|
|           Colombia|        618|
|              Japan|        478|
|        Netherlands|        453|
|             Sweden|        397|
|           China PR|        392|
|              Chile|        391|
|Republic of Ireland|        368|
|             Mexico|        366|
|      United States|        353|
|             Poland|        350|
|             Norway|        341|
|       Saudi Arabia|        340|
|            Denmark|        336|
+-------------------+-----------+
only showing top 20 rows


// Display Player Name, Nationality, Club informations

scala> df.select("Name","Nationality","Club").show()
+-----------------+-----------+-------------------+
|             Name|Nationality|               Club|
+-----------------+-----------+-------------------+
|         L. Messi|  Argentina|       FC Barcelona|
|Cristiano Ronaldo|   Portugal|           Juventus|
|        Neymar Jr|     Brazil|Paris Saint-Germain|
|           De Gea|      Spain|  Manchester United|
|     K. De Bruyne|    Belgium|    Manchester City|
|        E. Hazard|    Belgium|            Chelsea|
|        L. Modri?|    Croatia|        Real Madrid|
|        L. Suárez|    Uruguay|       FC Barcelona|
|     Sergio Ramos|      Spain|        Real Madrid|
|         J. Oblak|   Slovenia|    Atlético Madrid|
|   R. Lewandowski|     Poland|  FC Bayern München|
|         T. Kroos|    Germany|        Real Madrid|
|         D. Godín|    Uruguay|    Atlético Madrid|
|      David Silva|      Spain|    Manchester City|
|         N. Kanté|     France|            Chelsea|
|        P. Dybala|  Argentina|           Juventus|
|          H. Kane|    England|  Tottenham Hotspur|
|     A. Griezmann|     France|    Atlético Madrid|
|    M. ter Stegen|    Germany|       FC Barcelona|
|      T. Courtois|    Belgium|        Real Madrid|
+-----------------+-----------+-------------------+
only showing top 20 rows


// Age is Lesser than 30 
scala> df.select("ID","Name","Age","Nationality","Potential","Jersey Number","Club").filter("Age < 30").show()
+------+---------------+---+-----------+---------+-------------+-------------------+
|    ID|           Name|Age|Nationality|Potential|Jersey Number|               Club|
+------+---------------+---+-----------+---------+-------------+-------------------+
|190871|      Neymar Jr| 26|     Brazil|       93|           10|Paris Saint-Germain|
|193080|         De Gea| 27|      Spain|       93|            1|  Manchester United|
|192985|   K. De Bruyne| 27|    Belgium|       92|            7|    Manchester City|
|183277|      E. Hazard| 27|    Belgium|       91|           10|            Chelsea|
|200389|       J. Oblak| 25|   Slovenia|       93|            1|    Atlético Madrid|
|188545| R. Lewandowski| 29|     Poland|       90|            9|  FC Bayern München|
|182521|       T. Kroos| 28|    Germany|       90|            8|        Real Madrid|
|215914|       N. Kanté| 27|     France|       90|           13|            Chelsea|
|211110|      P. Dybala| 24|  Argentina|       94|           21|           Juventus|
|202126|        H. Kane| 24|    England|       91|            9|  Tottenham Hotspur|
|194765|   A. Griezmann| 27|     France|       90|            7|    Atlético Madrid|
|192448|  M. ter Stegen| 26|    Germany|       92|           22|       FC Barcelona|
|192119|    T. Courtois| 26|    Belgium|       90|            1|        Real Madrid|
|189511|Sergio Busquets| 29|      Spain|       89|            5|       FC Barcelona|
|231747|      K. Mbappé| 19|     France|       95|           10|Paris Saint-Germain|
|209331|       M. Salah| 26|      Egypt|       89|           10|          Liverpool|
|200145|       Casemiro| 26|     Brazil|       90|           14|        Real Madrid|
|198710|   J. Rodríguez| 26|   Colombia|       89|           10|  FC Bayern München|
|198219|     L. Insigne| 27|      Italy|       88|           10|             Napoli|
|197781|           Isco| 26|      Spain|       91|           22|        Real Madrid|
+------+---------------+---+-----------+---------+-------------+-------------------+
only showing top 20 rows

Thursday, 13 August 2020

Google Adsense Analytics using Spark with Scala

$ hdfs dfs -cat hdfs://localhost:8020/user/cloudera/adsense/adsense.csv | head
Date,Page views,Impressions,Clicks,Page RPM (USD),Impression RPM (USD),Active View Viewable,Estimated earnings (USD)
2010-02-25,61358,160401,525,1.04,0.4,,63.71
2011-07-03,60066,157661,774,0.62,0.24,,37.19
2011-04-03,59079,148407,339,0.5,0.2,,29.54
2011-07-04,55221,142543,802,0.88,0.34,,48.58
2011-11-21,50808,136381,786,1.04,0.39,,52.73
2011-11-28,48360,130869,996,1.12,0.41,,54.14
2011-07-08,49545,129778,806,0.94,0.36,,46.45
2011-11-20,49020,129486,622,0.88,0.33,,42.98
2010-12-18,45092,127493,289,1.19,0.42,,53.77


scala> val df  = spark.read.format("csv").option("header","True").option("inferSchema","True").load("hdfs://localhost:8020/user/cloudera/adsense/adsense.csv")

scala> df.printSchema()
root
 |-- Date: timestamp (nullable = true)
 |-- Page views: integer (nullable = true)
 |-- Impressions: integer (nullable = true)
 |-- Clicks: integer (nullable = true)
 |-- Page RPM (USD): double (nullable = true)
 |-- Impression RPM (USD): double (nullable = true)
 |-- Active View Viewable: string (nullable = true)
 |-- Estimated earnings (USD): double (nullable = true)



val dfAdsense = df.withColumnRenamed("Date","fldDate").withColumnRenamed("Page views","fldPageViews").withColumnRenamed("Impressions","fldImpressions").withColumnRenamed("Clicks","fldClicks").withColumnRenamed("Page RPM (USD)","fldPageRPM").withColumnRenamed("Impression RPM (USD)","fldimpressionRPM").withColumnRenamed("Active View Viewable","fldActiveViewViewable").withColumnRenamed("Estimated earnings (USD)","fldEstimatedEarnings")


scala> dfAdsense.printSchema()
root
 |-- fldDate: timestamp (nullable = true)
 |-- fldPageViews: integer (nullable = true)
 |-- fldImpressions: integer (nullable = true)
 |-- fldClicks: integer (nullable = true)
 |-- fldPageRPM: double (nullable = true)
 |-- fldimpressionRPM: double (nullable = true)
 |-- fldActiveViewViewable: string (nullable = true)
 |-- fldEstimatedEarnings: double (nullable = true)


// total row count before remove 0 earnings records
scala> dfAdsense.count
res3: Long = 2452

 
scala> val df = dfAdsense.filter($"fldEstimatedEarnings" =!= 0)
df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [fldDate: timestamp, fldPageViews: int ... 6 more fields]


scala> print(df.count())
1511


scala> df.agg(sum("fldEstimatedEarnings")).show()
+-------------------------+
|sum(fldEstimatedEarnings)|
+-------------------------+
|       16995.739999999976|


scala> df.select("fldDate","fldEstimatedEarnings").sort("fldEstimatedEarnings").limit(10).show()
+-------------------+--------------------+
|            fldDate|fldEstimatedEarnings|
+-------------------+--------------------+
|2008-09-08 00:00:00|                0.01|
|2016-02-22 00:00:00|                0.01|
|2008-09-03 00:00:00|                0.01|
|2009-03-07 00:00:00|                0.01|
|2008-09-05 00:00:00|                0.01|
|2011-12-10 00:00:00|                0.01|
|2011-12-06 00:00:00|                0.01|
|2011-12-13 00:00:00|                0.01|
|2008-09-14 00:00:00|                0.01|
|2008-08-30 00:00:00|                0.01|
+-------------------+--------------------+


scala> df.select("fldDate","fldEstimatedEarnings").sort(desc("fldEstimatedEarnings")).limit(10).show()
+-------------------+--------------------+
|            fldDate|fldEstimatedEarnings|
+-------------------+--------------------+
|2010-02-25 00:00:00|               63.71|
|2011-11-28 00:00:00|               54.14|
|2010-12-18 00:00:00|               53.77|
|2011-11-21 00:00:00|               52.73|
|2010-12-19 00:00:00|               52.14|
|2011-07-13 00:00:00|               50.86|
|2011-12-02 00:00:00|               49.63|
|2011-06-24 00:00:00|               48.91|
|2011-09-27 00:00:00|                48.8|
|2011-07-04 00:00:00|               48.58|
+-------------------+--------------------+


scala> df.select("fldDate","fldEstimatedEarnings", "fldClicks","fldPageViews").sort(desc("fldEstimatedEarnings")).limit(10).show()
+-------------------+--------------------+---------+------------+
|            fldDate|fldEstimatedEarnings|fldClicks|fldPageViews|
+-------------------+--------------------+---------+------------+
|2010-02-25 00:00:00|               63.71|      525|       61358|
|2011-11-28 00:00:00|               54.14|      996|       48360|
|2010-12-18 00:00:00|               53.77|      289|       45092|
|2011-11-21 00:00:00|               52.73|      786|       50808|
|2010-12-19 00:00:00|               52.14|      202|       35461|
|2011-07-13 00:00:00|               50.86|      751|       40785|
|2011-12-02 00:00:00|               49.63|      842|       42972|
|2011-06-24 00:00:00|               48.91|      704|       33192|
|2011-09-27 00:00:00|                48.8|      680|       34525|
|2011-07-04 00:00:00|               48.58|      802|       55221|
+-------------------+--------------------+---------+------------+



scala> df.groupBy(year($"fldDate")).agg(sum($"fldEstimatedEarnings")).show()
+-------------+-------------------------+                                       
|year(fldDate)|sum(fldEstimatedEarnings)|
+-------------+-------------------------+
|         2015|      0.21000000000000002|
|         2013|       1.2799999999999998|
|         2014|                     0.03|
|         2012|                  2702.84|
|         2009|       1640.6799999999964|
|         2016|                     0.25|
|         2010|       2883.2799999999975|
|         2011|                  9392.38|
|         2008|        374.7300000000001|
|         2017|     0.060000000000000005|
+-------------+-------------------------+


scala> df.groupBy(year($"fldDate")).agg(round(sum($"fldEstimatedEarnings"),2)).show()
+-------------+-----------------------------------+                             
|year(fldDate)|round(sum(fldEstimatedEarnings), 2)|
+-------------+-----------------------------------+
|         2015|                               0.21|
|         2013|                               1.28|
|         2014|                               0.03|
|         2012|                            2702.84|
|         2009|                            1640.68|
|         2016|                               0.25|
|         2010|                            2883.28|
|         2011|                            9392.38|
|         2008|                             374.73|
|         2017|                               0.06|
+-------------+-----------------------------------+

scala> df.groupBy(year($"fldDate").alias("Year")).agg(round(sum($"fldEstimatedEarnings"),2).alias("Earnings")).show()
+----+--------+
|Year|Earnings|
+----+--------+
|2015|    0.21|
|2013|    1.28|
|2014|    0.03|
|2012| 2702.84|
|2009| 1640.68|
|2016|    0.25|
|2010| 2883.28|
|2011| 9392.38|
|2008|  374.73|
|2017|    0.06|
+----+--------+


scala> df.groupBy(year($"fldDate").alias("Year"), month($"fldDate").alias("Month")).agg(round(sum($"fldEstimatedEarnings"),2).alias("Earnings")).sort("Year","Month").show(200)
+----+-----+--------+                                                           
|Year|Month|Earnings|
+----+-----+--------+
|2008|    6|   35.76|
|2008|    7|   82.05|
|2008|    8|   27.17|
|2008|    9|   21.12|
|2008|   10|   65.87|
|2008|   11|   78.38|
|2008|   12|   64.38|
|2009|    1|   130.1|
|2009|    2|   26.69|
|2009|    3|   59.68|
|2009|    4|   94.23|
|2009|    5|  186.15|
|2009|    6|  248.88|
|2009|    7|  190.95|
|2009|    8|    81.7|
|2009|    9|   60.99|
|2009|   10|   206.2|
|2009|   11|  198.68|
|2009|   12|  156.43|
|2010|    1|  101.46|
|2010|    2|  316.61|
|2010|    3|  179.72|
|2010|    4|  142.26|
|2010|    5|  120.05|
|2010|    6|  132.18|
|2010|    7|  134.75|
|2010|    8|  137.77|
|2010|    9|  265.05|
|2010|   10|  267.31|
|2010|   11|   372.2|
|2010|   12|  713.92|
|2011|    1|  478.87|
|2011|    2|  551.54|
|2011|    3|   549.9|
|2011|    4|  683.07|
|2011|    5|  620.94|
|2011|    6|  969.76|
|2011|    7| 1149.31|
|2011|    8| 1057.64|
|2011|    9| 1039.34|
|2011|   10|  966.37|
|2011|   11| 1138.69|
|2011|   12|  186.95|
|2012|    1|    0.48|
|2012|    2|  256.25|
|2012|    3|  564.13|
|2012|    4|  524.13|
|2012|    5|  504.83|
|2012|    6|  370.12|
|2012|    7|   261.1|
|2012|    8|  219.39|
|2012|    9|    2.38|
|2012|   10|    0.02|
|2012|   12|    0.01|
|2013|    1|    0.01|
|2013|    3|    0.25|
|2013|    4|    0.11|
|2013|    5|    0.09|
|2013|    6|    0.25|
|2013|    7|    0.03|
|2013|    9|    0.09|
|2013|   10|    0.32|
|2013|   12|    0.13|
|2014|    2|    0.03|
|2015|    2|    0.08|
|2015|    4|    0.03|
|2015|    7|     0.1|
|2016|    2|    0.01|
|2016|    3|     0.2|
|2016|    9|    0.03|
|2016|   11|    0.01|
|2017|    6|    0.06|
+----+-----+--------+


// order by Earnings group by Year and Month
scala> df.groupBy(year($"fldDate").alias("Year"), month($"fldDate").alias("Month")).agg(round(sum($"fldEstimatedEarnings"),2).alias("Earnings")).sort(desc("Earnings")).show(200) 
+----+-----+--------+                                                           
|Year|Month|Earnings|
+----+-----+--------+
|2011|    7| 1149.31|
|2011|   11| 1138.69|
|2011|    8| 1057.64|
|2011|    9| 1039.34|
|2011|    6|  969.76|
|2011|   10|  966.37|
|2010|   12|  713.92|
|2011|    4|  683.07|
|2011|    5|  620.94|
|2012|    3|  564.13|
|2011|    2|  551.54|
|2011|    3|   549.9|
|2012|    4|  524.13|
|2012|    5|  504.83|
|2011|    1|  478.87|
|2010|   11|   372.2|
|2012|    6|  370.12|
|2010|    2|  316.61|
|2010|   10|  267.31|
|2010|    9|  265.05|
|2012|    7|   261.1|
|2012|    2|  256.25|
|2009|    6|  248.88|
|2012|    8|  219.39|
|2009|   10|   206.2|
|2009|   11|  198.68|
|2009|    7|  190.95|
|2011|   12|  186.95|
|2009|    5|  186.15|
|2010|    3|  179.72|
|2009|   12|  156.43|
|2010|    4|  142.26|
|2010|    8|  137.77|
|2010|    7|  134.75|
|2010|    6|  132.18|
|2009|    1|   130.1|
|2010|    5|  120.05|
|2010|    1|  101.46|
|2009|    4|   94.23|
|2008|    7|   82.05|
|2009|    8|    81.7|
|2008|   11|   78.38|
|2008|   10|   65.87|
|2008|   12|   64.38|
|2009|    9|   60.99|
|2009|    3|   59.68|
|2008|    6|   35.76|
|2008|    8|   27.17|
|2009|    2|   26.69|
|2008|    9|   21.12|
|2012|    9|    2.38|
|2012|    1|    0.48|
|2013|   10|    0.32|
|2013|    3|    0.25|
|2013|    6|    0.25|
|2016|    3|     0.2|
|2013|   12|    0.13|
|2013|    4|    0.11|
|2015|    7|     0.1|
|2013|    5|    0.09|
|2013|    9|    0.09|
|2015|    2|    0.08|
|2017|    6|    0.06|
|2013|    7|    0.03|
|2014|    2|    0.03|
|2015|    4|    0.03|
|2016|    9|    0.03|
|2012|   10|    0.02|
|2016|   11|    0.01|
|2012|   12|    0.01|
|2016|    2|    0.01|
|2013|    1|    0.01|
+----+-----+--------+

Monday, 3 August 2020

SQOOP Import Problem and Solution #3

#Problem : 2

Connect to MySQL database using SQOOP,
import all customers whose street name contains "Plaza"

Ex:
Hazy Mountain Plaza
Tawny Fox Plaza

Data description:
A MySQL instance is running on the localhost node.
In that instance, you will find customers table that contains
customer's data.

Installation : localhost
Database name : retail_db
Table Name : Customers
User Name : root 
Password : cloudera

Output requirement:

Place the customers files in HDFS directory
"user/cloudera/problem1/customers/txtdata"

Save output in text format with fields 
separated by a "*" and lines should be terminated by pipe.


Select the columns : customer_id,
customer_fname,customer_lname,
customer_street_name.



$ sqoop import \
-connect "jdbc:mysql://localhost/retail_db" \
-username root \
-password cloudera \
-table customers \
-where "customer_street like '%Plaza%'" \
-target-dir /user/cloudera/problem1/customers/textdata \
-columns "customer_id,customer_fname,customer_lname,customer_street"  


dfs dfs -ls /user/cloudera/problem1/customers/textdata
Found 5 items
-rw-r--r--   1 cloudera cloudera          0 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/_SUCCESS
-rw-r--r--   1 cloudera cloudera        974 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00000
-rw-r--r--   1 cloudera cloudera       1165 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00001
-rw-r--r--   1 cloudera cloudera        986 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00002
-rw-r--r--   1 cloudera cloudera       1028 2020-08-03 08:40 /user/cloudera/problem1/customers/textdata/part-m-00003


$ hdfs dfs -cat /user/cloudera/problem1/customers/textdata/part-m-00000 | tail
1793,Mary,Burke,141 Dewy Plaza
1821,Mary,Trevino,6520 Quaking Quail Plaza
1891,Mary,Mora,1804 Misty Plaza
2053,Ronald,Smith,1025 Colonial Plaza
2247,Alice,Werner,1557 Heather Leaf Plaza
2517,Jose,Kane,5202 Emerald Island Plaza
2545,Larry,Patel,1688 Middle Panda Plaza
2575,Mary,Carroll,4979 Honey Plaza
2831,Mary,Padilla,3810 Crystal Plaza
2875,Mary,Roberts,2205 Velvet Plaza


 

SQOOP Import Problem and Solution #2

#Problem : 2

Connect to MySQL database using SQOOP,
import all customers lives in 'CA' state.

Data description:
A MySQL instance is running on the localhost node.
In that instance, you will find customers table that contains
customer's data.

Installation : localhost
Database name : retail_db
Table Name : Customers
User Name : root 
Password : cloudera

Output requirement:

Place the customers files in HDFS directory
"user/cloudera/problem1/customers_selected/avrodata"

Use avro format with Snappy compression

Select the columns : customer_id,customer_fname,customer_lname,customer_state.



$ sqoop import \
-connect "jdbc:mysql://localhost/retail_db" \
-username root \
-password cloudera \
-table customers \
-where "customer_state='CA'" \
-target-dir /user/cloudera/problem1/customers_selected/avrodata \
-columns "customer_id,customer_fname,customer_lname,customer_state" \
-compress \
-compression-codec snappy \
-as-avrodatafile


$ hdfs dfs -ls /user/cloudera/problem1/customers_selected/avrodata
Found 5 items
-rw-r--r--   1 cloudera cloudera          0 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/_SUCCESS
-rw-r--r--   1 cloudera cloudera       6508 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00000.avro
-rw-r--r--   1 cloudera cloudera       6427 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00001.avro
-rw-r--r--   1 cloudera cloudera       6419 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00002.avro
-rw-r--r--   1 cloudera cloudera       7093 2020-08-03 08:24 /user/cloudera/problem1/customers_selected/avrodata/part-m-00003.avro


$ avro-tools tojson hdfs://localhost/user/cloudera/problem1/customers_selected/avrodata/part-m-00003.avro | head
log4j:WARN No appenders could be found for logger (org.apache.hadoop.metrics2.lib.MutableMetricsFactory).
log4j:WARN Please initialize the log4j system properly.
log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.
{"customer_id":{"int":9328},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Perez"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9333},"customer_fname":{"string":"Angela"},"customer_lname":{"string":"Mills"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9336},"customer_fname":{"string":"Janice"},"customer_lname":{"string":"Guzman"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9338},"customer_fname":{"string":"James"},"customer_lname":{"string":"Davis"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9340},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Smith"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9347},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Fuentes"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9352},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Lewis"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9357},"customer_fname":{"string":"Katherine"},"customer_lname":{"string":"Spence"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9363},"customer_fname":{"string":"Mary"},"customer_lname":{"string":"Simmons"},"customer_state":{"string":"CA"}}
{"customer_id":{"int":9375},"customer_fname":{"string":"George"},"customer_lname":{"string":"Smith"},"customer_state":{"string":"CA"}}

SQOOP Import problem and solution #1

SQOOP problem : #1
Connect to MySQL database using SQOOP, import all orders
from orders table whose order_status is COMPLETE.

Data Description:
A MySQL instance is running on the localhost.In that instance, you will find orders table that 
contains order's data.

Installation : LocalHost
Database Name : retail_db
Table Name : Orders
User Name : root
Password : cloudera

Output requirement:
Place the order's files in HDFS directory:
/user/cloudera/problem1/orders/parquetdata
Use parquet format with tab delimiter and snappy compression/

NULL values are represented as -1 for numbers and "NA" for strings.


Solution:
$ sqoop import \
-connect "jdbc:mysql://localhost/retail_db" \
-username root \
-password cloudera \
-table orders \
-target-dir /user/cloudera/problem1/orders/parquetdata \
-delete-target-dir \
-fields-terminated-by "\t" \
-where "order_status='COMPLETE'" \
-null-string "NA" \
-null-non-string -1 \
-compress \
-compression-codec snappy \
-as-parquetfile

$ hdfs dfs -ls /user/cloudera/problem1/orders/parquetdataFound 6 items
drwxr-xr-x   - cloudera cloudera          0 2020-08-03 08:09 /user/cloudera/problem1/orders/parquetdata/.metadata
drwxr-xr-x   - cloudera cloudera          0 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/.signals
-rw-r--r--   1 cloudera cloudera      50233 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/1fa590f9-281d-4d62-9650-ddfe04d7eae2.parquet
-rw-r--r--   1 cloudera cloudera      47508 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/217e22ee-25b2-41a6-8bf9-f6fee4c7d0c4.parquet
-rw-r--r--   1 cloudera cloudera      46231 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/826a8254-9700-4edd-afdb-b8fb58b2fd49.parquet
-rw-r--r--   1 cloudera cloudera      46406 2020-08-03 08:11 /user/cloudera/problem1/orders/parquetdata/84355f27-d1a4-4c1b-ac9d-84fd5139a912.parquet



// View the parquet file content using parquet-tools:

[cloudera@quickstart ~]$ parquet-tools head -n5 hdfs://localhost/user/cloudera/problem1/orders/parquetdata/1fa590f9-281d-4d62-9650-ddfe04d7eae2.parquet
order_id = 51665
order_date = 1402729200000
order_customer_id = 8645
order_status = COMPLETE

order_id = 51670
order_date = 1402729200000
order_customer_id = 9322
order_status = COMPLETE

order_id = 51671
order_date = 1402729200000
order_customer_id = 9000
order_status = COMPLETE

order_id = 51673
order_date = 1402729200000
order_customer_id = 7538
order_status = COMPLETE

order_id = 51677
order_date = 1402729200000
order_customer_id = 6321
order_status = COMPLETE

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...