// FIFA 2019 Dataset Analytics with Spark and Scala
scala> val df = spark.read.format("csv").option("header","True").option("inferSchema","True").load("D:\\Ex\\FIFA\\data.csv")
df: org.apache.spark.sql.DataFrame = [ID: int, Name: int ... 86 more fields]
scala> df.printSchema()
root
|-- _c0: integer (nullable = true)
|-- ID: integer (nullable = true)
|-- Name: integer (nullable = true)
|-- Age: string (nullable = true)
|-- Photo: integer (nullable = true)
|-- Nationality: string (nullable = true)
|-- Flag: string (nullable = true)
|-- Overall: string (nullable = true)
|-- Potential: integer (nullable = true)
|-- Club: integer (nullable = true)
|-- Club Logo: string (nullable = true)
|-- Value: string (nullable = true)
|-- Wage: string (nullable = true)
|-- Special: string (nullable = true)
|-- Preferred Foot: integer (nullable = true)
|-- International Reputation: string (nullable = true)
|-- Weak Foot: integer (nullable = true)
|-- Skill Moves: integer (nullable = true)
|-- Work Rate: integer (nullable = true)
|-- Body Type: string (nullable = true)
|-- Real Face: string (nullable = true)
|-- Position: string (nullable = true)
|-- Jersey Number: string (nullable = true)
|-- Joined: integer (nullable = true)
|-- Loaned From: string (nullable = true)
|-- Contract Valid Until: string (nullable = true)
|-- Height: string (nullable = true)
|-- Weight: string (nullable = true)
|-- LS: string (nullable = true)
|-- ST: string (nullable = true)
|-- RS: string (nullable = true)
|-- LW: string (nullable = true)
|-- LF: string (nullable = true)
|-- CF: string (nullable = true)
|-- RF: string (nullable = true)
|-- RW: string (nullable = true)
|-- LAM: string (nullable = true)
|-- CAM: string (nullable = true)
|-- RAM: string (nullable = true)
|-- LM: string (nullable = true)
|-- LCM: string (nullable = true)
|-- CM: string (nullable = true)
|-- RCM: string (nullable = true)
|-- RM: string (nullable = true)
|-- LWB: string (nullable = true)
|-- LDM: string (nullable = true)
|-- CDM: string (nullable = true)
|-- RDM: string (nullable = true)
|-- RWB: string (nullable = true)
|-- LB: string (nullable = true)
|-- LCB: string (nullable = true)
|-- CB: string (nullable = true)
|-- RCB: string (nullable = true)
|-- RB: string (nullable = true)
|-- Crossing: string (nullable = true)
|-- Finishing: integer (nullable = true)
|-- HeadingAccuracy: integer (nullable = true)
|-- ShortPassing: integer (nullable = true)
|-- Volleys: integer (nullable = true)
|-- Dribbling: integer (nullable = true)
|-- Curve: integer (nullable = true)
|-- FKAccuracy: integer (nullable = true)
|-- LongPassing: integer (nullable = true)
|-- BallControl: integer (nullable = true)
|-- Acceleration: integer (nullable = true)
|-- SprintSpeed: integer (nullable = true)
|-- Agility: integer (nullable = true)
|-- Reactions: integer (nullable = true)
|-- Balance: integer (nullable = true)
|-- ShotPower: integer (nullable = true)
|-- Jumping: integer (nullable = true)
|-- Stamina: integer (nullable = true)
|-- Strength: integer (nullable = true)
|-- LongShots: integer (nullable = true)
|-- Aggression: integer (nullable = true)
|-- Interceptions: integer (nullable = true)
|-- Positioning: integer (nullable = true)
|-- Vision: integer (nullable = true)
|-- Penalties: integer (nullable = true)
|-- Composure: integer (nullable = true)
|-- Marking: integer (nullable = true)
|-- StandingTackle: integer (nullable = true)
|-- SlidingTackle: integer (nullable = true)
|-- GKDiving: integer (nullable = true)
|-- GKHandling: integer (nullable = true)
|-- GKKicking: integer (nullable = true)
|-- GKPositioning: integer (nullable = true)
|-- GKReflexes: integer (nullable = true)
|-- Release Clause: integer (nullable = true)
// Total Row Count
scala> df.count()
res1: Long = 18207
// Display all the column names
scala> df.columns.foreach(println)
_c0
ID
Name
Age
Photo
Nationality
Flag
Overall
Potential
Club
Club Logo
Value
Wage
Special
Preferred Foot
International Reputation
Weak Foot
Skill Moves
Work Rate
Body Type
Real Face
Position
Jersey Number
Joined
Loaned From
Contract Valid Until
Height
Weight
LS
ST
RS
LW
LF
CF
RF
RW
LAM
CAM
RAM
LM
LCM
CM
RCM
RM
LWB
LDM
CDM
RDM
RWB
LB
LCB
CB
RCB
RB
Crossing
Finishing
HeadingAccuracy
ShortPassing
Volleys
Dribbling
Curve
FKAccuracy
LongPassing
BallControl
Acceleration
SprintSpeed
Agility
Reactions
Balance
ShotPower
Jumping
Stamina
Strength
LongShots
Aggression
Interceptions
Positioning
Vision
Penalties
Composure
Marking
StandingTackle
SlidingTackle
GKDiving
GKHandling
GKKicking
GKPositioning
GKReflexes
Release Clause
// Total number of columns
scala> df.columns.length
res4: Int = 88
scala> df.columns.size
res5: Int = 88
scala> df.describe("Value").show()
+-------+-----+
|summary|Value|
+-------+-----+
| count|18207|
| mean| null|
| stddev| null|
| min| ?0|
| max| ?9M|
+-------+-----+
// Display Name and Nationality
scala> df.select("Name","Nationality").show(25)
+-----------------+-----------+
| Name|Nationality|
+-----------------+-----------+
| L. Messi| Argentina|
|Cristiano Ronaldo| Portugal|
| Neymar Jr| Brazil|
| De Gea| Spain|
| K. De Bruyne| Belgium|
| E. Hazard| Belgium|
| L. Modri?| Croatia|
| L. Suárez| Uruguay|
| Sergio Ramos| Spain|
| J. Oblak| Slovenia|
| R. Lewandowski| Poland|
| T. Kroos| Germany|
| D. Godín| Uruguay|
| David Silva| Spain|
| N. Kanté| France|
| P. Dybala| Argentina|
| H. Kane| England|
| A. Griezmann| France|
| M. ter Stegen| Germany|
| T. Courtois| Belgium|
| Sergio Busquets| Spain|
| E. Cavani| Uruguay|
| M. Neuer| Germany|
| S. Agüero| Argentina|
| G. Chiellini| Italy|
+-----------------+-----------+
only showing top 25 rows
// Display the total players count for each Nationality
scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).show(2000)
+--------------------+------------+
| Nationality|Total Player|
+--------------------+-----------+
| Chad| 2|
| Russia| 79|
| Paraguay| 85|
| Senegal| 130|
| Sweden| 397|
| Guyana| 3|
| Philippines| 2|
| Eritrea| 2|
| Fiji| 1|
| Turkey| 303|
| Iraq| 7|
| Germany| 1198|
| St Kitts Nevis| 3|
| Comoros| 6|
| Afghanistan| 4|
| Ivory Coast| 100|
| Jordan| 1|
| Rwanda| 1|
| Sudan| 3|
| France| 914|
| Greece| 102|
| Kosovo| 33|
|Central African Rep.| 3|
| DR Congo| 52|
| Montserrat| 4|
| Algeria| 60|
| Togo| 12|
| Equatorial Guinea| 5|
| Slovakia| 54|
| Argentina| 937|
| Wales| 129|
| Belgium| 260|
| Angola| 15|
| St Lucia| 1|
| Ecuador| 43|
| Qatar| 1|
| Albania| 40|
| Madagascar| 12|
| Finland| 67|
| New Caledonia| 1|
| Ghana| 114|
| Nicaragua| 2|
| Peru| 37|
| Benin| 15|
| Sierra Leone| 6|
| United States| 353|
| Curacao| 14|
| India| 30|
| Belarus| 4|
| Kuwait| 1|
| Malta| 1|
| Chile| 391|
| Puerto Rico| 1|
| Croatia| 126|
| Burundi| 3|
| Nigeria| 121|
| Bolivia| 30|
| Andorra| 1|
| Gabon| 15|
| Korea Republic| 335|
| Italy| 702|
| Suriname| 4|
| Lithuania| 8|
| Norway| 341|
| Spain| 1072|
| Cuba| 4|
| Mauritania| 4|
| Denmark| 336|
| Niger| 3|
| Barbados| 3|
| Iran| 17|
| Congo| 25|
| Liechtenstein| 3|
| Thailand| 5|
| Morocco| 85|
| Cape Verde| 19|
| Panama| 15|
| Hong Kong| 2|
| Korea DPR| 4|
| Ukraine| 73|
| Venezuela| 67|
| Iceland| 47|
| Israel| 14|
| Bosnia Herzegovina| 61|
| Oman| 1|
| Cyprus| 8|
| Palestine| 1|
| Uruguay| 149|
| Mexico| 366|
| FYR Macedonia| 20|
| Montenegro| 23|
| Zimbabwe| 13|
| Estonia| 13|
| Georgia| 26|
| Indonesia| 1|
| Guatemala| 3|
| Guam| 1|
| Libya| 4|
| Azerbaijan| 5|
| Grenada| 1|
| Armenia| 10|
| Tunisia| 32|
| Liberia| 1|
| Syria| 9|
| Honduras| 16|
| Saudi Arabia| 340|
| Uganda| 6|
| Namibia| 3|
| Switzerland| 220|
| Zambia| 9|
| Ethiopia| 1|
| Jamaica| 32|
| Latvia| 6|
|United Arab Emirates| 1|
| South Sudan| 1|
| Guinea| 31|
| Canada| 64|
| Uzbekistan| 2|
| Faroe Islands| 6|
| Czech Republic| 100|
| Mozambique| 4|
| Brazil| 827|
| Belize| 1|
| Kenya| 10|
| Gambia| 15|
| Lebanon| 1|
| Slovenia| 55|
| Dominican Republic| 2|
| Japan| 478|
| Tanzania| 3|
| Botswana| 1|
| Republic of Ireland| 368|
| Luxembourg| 8|
| New Zealand| 44|
| England| 1662|
| Trinidad & Tobago| 4|
| China PR| 392|
| Haiti| 10|
| Poland| 350|
| Portugal| 322|
| Cameroon| 90|
| Australia| 236|
| Romania| 54|
| Bulgaria| 32|
| Austria| 298|
| Egypt| 31|
| Costa Rica| 30|
| El Salvador| 5|
| Kazakhstan| 4|
| Serbia| 126|
| Burkina Faso| 16|
| South Africa| 71|
| Bermuda| 2|
| Scotland| 286|
| Colombia| 618|
| Northern Ireland| 80|
| Hungary| 38|
| Guinea Bissau| 15|
| Antigua & Barbuda| 4|
| Mauritius| 1|
| Moldova| 5|
| Netherlands| 453|
| Mali| 43|
| S?o Tomé & Príncipe| 1|
+--------------------+-----------+
// Top 20 Nationalities in ASCENDING order
scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).orderBy("Nationality").show(
+------------------+------------+
| Nationality|Total Player|
+------------------+------------+
| Afghanistan| 4|
| Albania| 40|
| Algeria| 60|
| Andorra| 1|
| Angola| 15|
| Antigua & Barbuda| 4|
| Argentina| 937|
| Armenia| 10|
| Australia| 236|
| Austria| 298|
| Azerbaijan| 5|
| Barbados| 3|
| Belarus| 4|
| Belgium| 260|
| Belize| 1|
| Benin| 15|
| Bermuda| 2|
| Bolivia| 30|
|Bosnia Herzegovina| 61|
| Botswana| 1|
+------------------+------------+
// Nationality wise descending order
scala> df.groupBy("Nationality").agg(count("Name").alias("Total Player")).orderBy(desc("Nationality")).show(20)
+--------------------+------------+
| Nationality|Total Player|
+--------------------+------------+
| Zimbabwe| 13|
| Zambia| 9|
| Wales| 129|
| Venezuela| 67|
| Uzbekistan| 2|
| Uruguay| 149|
| United States| 353|
|United Arab Emirates| 1|
| Ukraine| 73|
| Uganda| 6|
| Turkey| 303|
| Tunisia| 32|
| Trinidad & Tobago| 4|
| Togo| 12|
| Thailand| 5|
| Tanzania| 3|
| S?o Tomé & Príncipe| 1|
| Syria| 9|
| Switzerland| 220|
| Sweden| 397|
+--------------------+------------+
only showing top 20 rows
// Total Players for each Nationality in ASCending Order
scala> df.groupBy("Nationality").agg(count("Name").alias("TotalPlayer")).orderBy(desc("TotalPlayer")).show(20)
+-------------------+-----------+
| Nationality|TotalPlayer|
+-------------------+-----------+
| England| 1662|
| Germany| 1198|
| Spain| 1072|
| Argentina| 937|
| France| 914|
| Brazil| 827|
| Italy| 702|
| Colombia| 618|
| Japan| 478|
| Netherlands| 453|
| Sweden| 397|
| China PR| 392|
| Chile| 391|
|Republic of Ireland| 368|
| Mexico| 366|
| United States| 353|
| Poland| 350|
| Norway| 341|
| Saudi Arabia| 340|
| Denmark| 336|
+-------------------+-----------+
only showing top 20 rows
// Display Player Name, Nationality, Club informations
scala> df.select("Name","Nationality","Club").show()
+-----------------+-----------+-------------------+
| Name|Nationality| Club|
+-----------------+-----------+-------------------+
| L. Messi| Argentina| FC Barcelona|
|Cristiano Ronaldo| Portugal| Juventus|
| Neymar Jr| Brazil|Paris Saint-Germain|
| De Gea| Spain| Manchester United|
| K. De Bruyne| Belgium| Manchester City|
| E. Hazard| Belgium| Chelsea|
| L. Modri?| Croatia| Real Madrid|
| L. Suárez| Uruguay| FC Barcelona|
| Sergio Ramos| Spain| Real Madrid|
| J. Oblak| Slovenia| Atlético Madrid|
| R. Lewandowski| Poland| FC Bayern München|
| T. Kroos| Germany| Real Madrid|
| D. Godín| Uruguay| Atlético Madrid|
| David Silva| Spain| Manchester City|
| N. Kanté| France| Chelsea|
| P. Dybala| Argentina| Juventus|
| H. Kane| England| Tottenham Hotspur|
| A. Griezmann| France| Atlético Madrid|
| M. ter Stegen| Germany| FC Barcelona|
| T. Courtois| Belgium| Real Madrid|
+-----------------+-----------+-------------------+
only showing top 20 rows
// Age is Lesser than 30
scala> df.select("ID","Name","Age","Nationality","Potential","Jersey Number","Club").filter("Age < 30").show()
+------+---------------+---+-----------+---------+-------------+-------------------+
| ID| Name|Age|Nationality|Potential|Jersey Number| Club|
+------+---------------+---+-----------+---------+-------------+-------------------+
|190871| Neymar Jr| 26| Brazil| 93| 10|Paris Saint-Germain|
|193080| De Gea| 27| Spain| 93| 1| Manchester United|
|192985| K. De Bruyne| 27| Belgium| 92| 7| Manchester City|
|183277| E. Hazard| 27| Belgium| 91| 10| Chelsea|
|200389| J. Oblak| 25| Slovenia| 93| 1| Atlético Madrid|
|188545| R. Lewandowski| 29| Poland| 90| 9| FC Bayern München|
|182521| T. Kroos| 28| Germany| 90| 8| Real Madrid|
|215914| N. Kanté| 27| France| 90| 13| Chelsea|
|211110| P. Dybala| 24| Argentina| 94| 21| Juventus|
|202126| H. Kane| 24| England| 91| 9| Tottenham Hotspur|
|194765| A. Griezmann| 27| France| 90| 7| Atlético Madrid|
|192448| M. ter Stegen| 26| Germany| 92| 22| FC Barcelona|
|192119| T. Courtois| 26| Belgium| 90| 1| Real Madrid|
|189511|Sergio Busquets| 29| Spain| 89| 5| FC Barcelona|
|231747| K. Mbappé| 19| France| 95| 10|Paris Saint-Germain|
|209331| M. Salah| 26| Egypt| 89| 10| Liverpool|
|200145| Casemiro| 26| Brazil| 90| 14| Real Madrid|
|198710| J. Rodríguez| 26| Colombia| 89| 10| FC Bayern München|
|198219| L. Insigne| 27| Italy| 88| 10| Napoli|
|197781| Isco| 26| Spain| 91| 22| Real Madrid|
+------+---------------+---+-----------+---------+-------------+-------------------+
only showing top 20 rows
No comments:
Post a Comment