Friday, 29 May 2020

Google AdSense Data Analysis using PySpark RDDs

import collections
from datetime import datetime 

Adsense = collections.namedtuple('Adsense',['fldDate','fldPageViews','fldImpressions','fldClicks','fldPageRPM','fldImpressionRPM','fldActiveViewViewable','fldEstimatedEarnings']) 

def parseAdsense(_row):
    # Split one CSV line into the fields of the Adsense namedtuple.
    fields = _row.split(",")
    _date = fields[0]
    _pageViews = int(fields[1])
    _impressions = int(fields[2])
    _clicks = int(fields[3])
    _pageRPM = fields[4]
    _impressionRPM = fields[5]
    _activeViewViewable = fields[6]
    _estimatedEarnings = float(fields[7])
    return Adsense(_date, _pageViews, _impressions, _clicks, _pageRPM, _impressionRPM, _activeViewViewable, _estimatedEarnings)
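
The snippets below use a spark session that the post assumes is already available (for example in a pyspark shell or notebook). If you run this as a standalone script, a minimal sketch to create one could look like the following; the app name and local master are illustrative assumptions, not part of the original post.

from pyspark.sql import SparkSession

# Minimal sketch: create (or reuse) a local SparkSession.
spark = SparkSession.builder \
    .appName("AdsenseRDDAnalysis") \
    .master("local[*]") \
    .getOrCreate()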

adsense_r1 = spark.sparkContext.textFile("E:\\DataSets\\adsense_data.csv")
#print(adsense_r1.count())
#for j in adsense_r1.collect():
#    print(j)
headerInfo  = adsense_r1.first()
#print(headerInfo)

adsense_r2 = adsense_r1.filter(lambda x: headerInfo not in x)
#print(adsense_r2.first())
#print(adsense_r2.count())
#for j in adsense_r2.collect():
#    print(j)


adsense_r3 = adsense_r2.map(parseAdsense)
print(adsense_r3.count())

adsense_r4 = adsense_r3.filter(lambda x: x.fldEstimatedEarnings != 0)
print(adsense_r4.count())


2452
1511
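
Since adsense_r4 is reused by every query below, caching it is a standard RDD optimization (not something the original post does) that avoids re-reading and re-parsing the CSV for each action:

# Optional: keep the parsed, filtered RDD in memory across the queries below.
adsense_r4.cache()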

#Find the total Earnings 
totalEarnings = adsense_r4.map(lambda x: x.fldEstimatedEarnings).reduce(lambda x,y : x+y)

print(totalEarnings) #16995.739999999994
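
As an aside, the same total can be computed with the built-in sum() action instead of an explicit reduce; this is just an equivalent alternative:

# Equivalent to the reduce above; sum() is a built-in RDD action.
totalEarnings = adsense_r4.map(lambda x: x.fldEstimatedEarnings).sum()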


#Find the top 10 Earnings without Date
adsense_r4.map(lambda x: x.fldEstimatedEarnings).sortBy(lambda x: x, False).take(10)

[63.71, 54.14, 53.77, 52.73, 52.14, 50.86, 49.63, 48.91, 48.8, 48.58]
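
The sortBy/take pattern works, but for "largest N" queries the top() action is a shorter equivalent, shown here only as an alternative sketch:

# top(n) returns the n largest elements, so no explicit sort is needed.
adsense_r4.map(lambda x: x.fldEstimatedEarnings).top(10)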

#Find the top 10 Earnings with Date
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).take(10)

[('2010-02-25', 63.71),
 ('2011-11-28', 54.14),
 ('2010-12-18', 53.77),
 ('2011-11-21', 52.73),
 ('2010-12-19', 52.14),
 ('2011-07-13', 50.86),
 ('2011-12-02', 49.63),
 ('2011-06-24', 48.91),
 ('2011-09-27', 48.8),
 ('2011-07-04', 48.58)]
 
 
 #Find the least 10 Earnings without Date
adsense_r4.map(lambda x: x.fldEstimatedEarnings).sortBy(lambda x: x, True).take(10)

[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]


#Find the least 10 Earnings with Date
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], True).take(10)

[('2009-02-09', 0.01),
 ('2009-02-17', 0.01),
 ('2011-12-10', 0.01),
 ('2009-03-07', 0.01),
 ('2011-12-13', 0.01),
 ('2008-09-11', 0.01),
 ('2008-09-08', 0.01),
 ('2008-09-03', 0.01),
 ('2008-09-05', 0.01),
 ('2011-12-06', 0.01)]
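
For "smallest N" queries, takeOrdered() does the same job as sortBy/take; the key argument picks the earnings field out of each (date, earnings) pair:

# takeOrdered(n, key=...) returns the n smallest elements by the given key.
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).takeOrdered(10, key=lambda x: x[1])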

#On which dates were estimated earnings >= 40 (descending)?
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 40).map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).collect()

[('2010-02-25', 63.71),
 ('2011-11-28', 54.14),
 ('2010-12-18', 53.77),
 ('2011-11-21', 52.73),
 ('2010-12-19', 52.14),
 ('2011-07-13', 50.86),
 ('2011-12-02', 49.63),
 ('2011-06-24', 48.91),
 ('2011-09-27', 48.8),
 ('2011-07-04', 48.58),
 ('2011-06-23', 47.93),
 ('2011-07-12', 46.85),
 ('2011-11-30', 46.69),
 ('2011-07-08', 46.45),
 ('2010-12-14', 45.68),
 ('2011-06-26', 45.5),
 ('2011-10-19', 45.22),
 ('2011-06-30', 45.02),
 ('2011-06-22', 44.82),
 ('2011-06-25', 44.58),
 ('2011-07-16', 44.56),
 ('2011-07-09', 44.45),
 ('2011-11-23', 44.23),
 ('2011-07-27', 44.0),
 ('2011-11-22', 43.78),
 ('2011-09-21', 43.59),
 ('2011-11-24', 43.48),
 ('2011-07-11', 43.33),
 ('2011-11-29', 43.27),
 ('2011-07-05', 43.08),
 ('2011-09-18', 43.02),
 ('2011-11-20', 42.98),
 ('2011-09-13', 42.29),
 ('2011-09-17', 42.28),
 ('2011-08-09', 42.17),
 ('2011-10-27', 42.08),
 ('2011-09-20', 41.96),
 ('2011-07-23', 41.6),
 ('2011-09-23', 41.5),
 ('2011-11-15', 41.43),
 ('2011-10-18', 41.4),
 ('2011-07-14', 41.36),
 ('2011-07-10', 41.21),
 ('2011-11-17', 41.15),
 ('2011-09-19', 40.69),
 ('2011-11-26', 40.49),
 ('2011-08-05', 40.05)]
 
 
#How many days had estimated earnings >= 40?

adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 40).count()


47
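
A natural extension of these per-day queries (not in the original post) is aggregating earnings by year with reduceByKey; the dates are 'YYYY-MM-DD' strings, so the first four characters give the year:

# Illustrative extension: total estimated earnings per year, sorted by year.
adsense_r4.map(lambda x: (x.fldDate[:4], x.fldEstimatedEarnings)) \
          .reduceByKey(lambda a, b: a + b) \
          .sortByKey() \
          .collect()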


#Earnings >= 10 and <= 50, with date, descending
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 10 and x.fldEstimatedEarnings <= 50).map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).collect()


#Earnings >= 10 and <= 50 - count
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 10 and x.fldEstimatedEarnings <= 50).count()

583
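
Instead of filtering one range at a time, the histogram() action can count how many days fall into several earnings buckets in a single pass; the bucket boundaries below are just an example:

# histogram(buckets) returns (bucket_boundaries, counts_per_bucket).
adsense_r4.map(lambda x: x.fldEstimatedEarnings).histogram([0, 10, 20, 30, 40, 50, 70])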



#top 10 page views 

adsense_r4.map(lambda x: (x.fldDate, x.fldPageViews)).sortBy(lambda x:x[1],False).take(10)
[('2010-02-25', 61358),
 ('2011-07-03', 60066),
 ('2011-04-03', 59079),
 ('2011-07-04', 55221),
 ('2011-11-21', 50808),
 ('2011-07-08', 49545),
 ('2011-11-20', 49020),
 ('2011-11-13', 48613),
 ('2011-11-28', 48360),
 ('2011-11-22', 47939)]
 
 #top 10 clicks 
 adsense_r4.map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x:x[1],False).take(10)
 
 [('2011-11-26', 1044),
 ('2011-11-25', 1026),
 ('2011-11-28', 996),
 ('2011-11-29', 968),
 ('2011-11-18', 924),
 ('2011-11-16', 920),
 ('2011-11-17', 917),
 ('2011-11-12', 912),
 ('2011-08-06', 878),
 ('2011-11-13', 873)]
 
 # no of clicks >= 800 - Descending order
 adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x: x[1],False).collect()
 
[('2011-11-26', 1044),
 ('2011-11-25', 1026),
 ('2011-11-28', 996),
 ('2011-11-29', 968),
 ('2011-11-18', 924),
 ('2011-11-16', 920),
 ('2011-11-17', 917),
 ('2011-11-12', 912),
 ('2011-08-06', 878),
 ('2011-11-13', 873),
 ('2011-11-30', 849),
 ('2011-07-23', 845),
 ('2011-12-02', 842),
 ('2011-11-15', 829),
 ('2011-11-27', 823),
 ('2011-07-09', 812),
 ('2011-07-08', 806),
 ('2011-07-04', 802),
 ('2011-11-19', 801),
 ('2011-11-11', 800)]
 
 #no of clicks >= 800 - Ascending Order
 
 adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x: x[1],True).collect()
 
 [('2011-11-11', 800),
 ('2011-11-19', 801),
 ('2011-07-04', 802),
 ('2011-07-08', 806),
 ('2011-07-09', 812),
 ('2011-11-27', 823),
 ('2011-11-15', 829),
 ('2011-12-02', 842),
 ('2011-07-23', 845),
 ('2011-11-30', 849),
 ('2011-11-13', 873),
 ('2011-08-06', 878),
 ('2011-11-12', 912),
 ('2011-11-17', 917),
 ('2011-11-16', 920),
 ('2011-11-18', 924),
 ('2011-11-29', 968),
 ('2011-11-28', 996),
 ('2011-11-25', 1026),
 ('2011-11-26', 1044)]
 
#How many days had clicks >= 800?
 adsense_r4.filter(lambda x: x.fldClicks >= 800).count()
 
 20
 
#Total clicks over the days where clicks >= 800
 adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: x.fldClicks).reduce(lambda a,b:a+b)
 17667

#Total number of clicks across all days
adsense_r4.map(lambda x: x.fldClicks).reduce(lambda m,n: m+n)

248190



#Total number of page views across all days
adsense_r4.map(lambda x: x.fldPageViews).reduce(lambda m,n: m+n)

18626239
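
With the totals above, per-day averages follow directly; the mean() action computes them without a separate count step:

# Average page views and clicks per day (over days with non-zero earnings).
adsense_r4.map(lambda x: x.fldPageViews).mean()
adsense_r4.map(lambda x: x.fldClicks).mean()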

#Total days when page views >= 50000
adsense_r4.filter(lambda x: x.fldPageViews >= 50000).count()
5

#Total days when page views >= 10000
adsense_r4.filter(lambda x: x.fldPageViews >= 10000).count()

649
