Friday, 29 May 2020

Google AdSense Data Analysis using PySpark RDDs

import collections
from datetime import datetime 

Adsense = collections.namedtuple('Adsense',['fldDate','fldPageViews','fldImpressions','fldClicks','fldPageRPM','fldImpressionRPM','fldActiveViewViewable','fldEstimatedEarnings']) 

def parseAdsense(_row):
    # Split one CSV line into the fields of the Adsense namedtuple.
    fields = _row.split(",")
    _date = fields[0]
    _pageViews = int(fields[1])
    _impressions = int(fields[2])
    _clicks = int(fields[3])
    _pageRPM = fields[4]
    _impressionRPM = fields[5]
    _activeViewViewable = fields[6]
    _estimatedEarnings = float(fields[7])
    return Adsense(_date, _pageViews, _impressions, _clicks, _pageRPM, _impressionRPM, _activeViewViewable, _estimatedEarnings)
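
The snippets below use a spark session that the post assumes is already available (for example in a pyspark shell or notebook). If you run this as a standalone script, a minimal sketch to create one could look like the following; the app name and local master are illustrative assumptions, not part of the original post.

from pyspark.sql import SparkSession

# Minimal sketch: create (or reuse) a local SparkSession.
spark = SparkSession.builder \
    .appName("AdsenseRDDAnalysis") \
    .master("local[*]") \
    .getOrCreate()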

adsense_r1 = spark.sparkContext.textFile("E:\\DataSets\\adsense_data.csv")
#print(adsense_r1.count())
#for j in adsense_r1.collect():
#    print(j)
headerInfo  = adsense_r1.first()
#print(headerInfo)

adsense_r2 = adsense_r1.filter(lambda x: headerInfo not in x)
#print(adsense_r2.first())
#print(adsense_r2.count())
#for j in adsense_r2.collect():
#    print(j)


adsense_r3 = adsense_r2.map(parseAdsense)
print(adsense_r3.count())

adsense_r4 = adsense_r3.filter(lambda x: x.fldEstimatedEarnings != 0)
print(adsense_r4.count())


2452
1511
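
Since adsense_r4 is reused by every query below, caching it is a standard RDD optimization (not something the original post does) that avoids re-reading and re-parsing the CSV for each action:

# Optional: keep the parsed, filtered RDD in memory across the queries below.
adsense_r4.cache()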

#Find the total Earnings 
totalEarnings = adsense_r4.map(lambda x: x.fldEstimatedEarnings).reduce(lambda x,y : x+y)

print(totalEarnings) #16995.739999999994
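
As an aside, the same total can be computed with the built-in sum() action instead of an explicit reduce; this is just an equivalent alternative:

# Equivalent to the reduce above; sum() is a built-in RDD action.
totalEarnings = adsense_r4.map(lambda x: x.fldEstimatedEarnings).sum()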


#Find the top 10 Earnings without Date
adsense_r4.map(lambda x: x.fldEstimatedEarnings).sortBy(lambda x: x, False).take(10)

[63.71, 54.14, 53.77, 52.73, 52.14, 50.86, 49.63, 48.91, 48.8, 48.58]
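
The sortBy/take pattern works, but for "largest N" queries the top() action is a shorter equivalent, shown here only as an alternative sketch:

# top(n) returns the n largest elements, so no explicit sort is needed.
adsense_r4.map(lambda x: x.fldEstimatedEarnings).top(10)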

#Find the top 10 Earnings with Date
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).take(10)

[('2010-02-25', 63.71),
 ('2011-11-28', 54.14),
 ('2010-12-18', 53.77),
 ('2011-11-21', 52.73),
 ('2010-12-19', 52.14),
 ('2011-07-13', 50.86),
 ('2011-12-02', 49.63),
 ('2011-06-24', 48.91),
 ('2011-09-27', 48.8),
 ('2011-07-04', 48.58)]
 
 
 #Find the least 10 Earnings without Date
adsense_r4.map(lambda x: x.fldEstimatedEarnings).sortBy(lambda x: x, True).take(10)

[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]


#Find the least 10 Earnings with Date
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], True).take(10)

[('2009-02-09', 0.01),
 ('2009-02-17', 0.01),
 ('2011-12-10', 0.01),
 ('2009-03-07', 0.01),
 ('2011-12-13', 0.01),
 ('2008-09-11', 0.01),
 ('2008-09-08', 0.01),
 ('2008-09-03', 0.01),
 ('2008-09-05', 0.01),
 ('2011-12-06', 0.01)]
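
For "smallest N" queries, takeOrdered() does the same job as sortBy/take; the key argument picks the earnings field out of each (date, earnings) pair:

# takeOrdered(n, key=...) returns the n smallest elements by the given key.
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).takeOrdered(10, key=lambda x: x[1])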

#On which dates were estimated earnings >= 40 (descending)?
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 40).map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).collect()

[('2010-02-25', 63.71),
 ('2011-11-28', 54.14),
 ('2010-12-18', 53.77),
 ('2011-11-21', 52.73),
 ('2010-12-19', 52.14),
 ('2011-07-13', 50.86),
 ('2011-12-02', 49.63),
 ('2011-06-24', 48.91),
 ('2011-09-27', 48.8),
 ('2011-07-04', 48.58),
 ('2011-06-23', 47.93),
 ('2011-07-12', 46.85),
 ('2011-11-30', 46.69),
 ('2011-07-08', 46.45),
 ('2010-12-14', 45.68),
 ('2011-06-26', 45.5),
 ('2011-10-19', 45.22),
 ('2011-06-30', 45.02),
 ('2011-06-22', 44.82),
 ('2011-06-25', 44.58),
 ('2011-07-16', 44.56),
 ('2011-07-09', 44.45),
 ('2011-11-23', 44.23),
 ('2011-07-27', 44.0),
 ('2011-11-22', 43.78),
 ('2011-09-21', 43.59),
 ('2011-11-24', 43.48),
 ('2011-07-11', 43.33),
 ('2011-11-29', 43.27),
 ('2011-07-05', 43.08),
 ('2011-09-18', 43.02),
 ('2011-11-20', 42.98),
 ('2011-09-13', 42.29),
 ('2011-09-17', 42.28),
 ('2011-08-09', 42.17),
 ('2011-10-27', 42.08),
 ('2011-09-20', 41.96),
 ('2011-07-23', 41.6),
 ('2011-09-23', 41.5),
 ('2011-11-15', 41.43),
 ('2011-10-18', 41.4),
 ('2011-07-14', 41.36),
 ('2011-07-10', 41.21),
 ('2011-11-17', 41.15),
 ('2011-09-19', 40.69),
 ('2011-11-26', 40.49),
 ('2011-08-05', 40.05)]
 
 
#How many days had estimated earnings >= 40?

adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 40).count()


47
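
A natural extension of these per-day queries (not in the original post) is aggregating earnings by year with reduceByKey; the dates are 'YYYY-MM-DD' strings, so the first four characters give the year:

# Illustrative extension: total estimated earnings per year, sorted by year.
adsense_r4.map(lambda x: (x.fldDate[:4], x.fldEstimatedEarnings)) \
          .reduceByKey(lambda a, b: a + b) \
          .sortByKey() \
          .collect()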


#Earnings >= 10 and <= 50, with date, descending
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 10 and x.fldEstimatedEarnings <= 50).map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).collect()


#Earnings >= 10 and <= 50 - count
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 10 and x.fldEstimatedEarnings <= 50).count()

583
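
Instead of filtering one range at a time, the histogram() action can count how many days fall into several earnings buckets in a single pass; the bucket boundaries below are just an example:

# histogram(buckets) returns (bucket_boundaries, counts_per_bucket).
adsense_r4.map(lambda x: x.fldEstimatedEarnings).histogram([0, 10, 20, 30, 40, 50, 70])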



#top 10 page views 

adsense_r4.map(lambda x: (x.fldDate, x.fldPageViews)).sortBy(lambda x:x[1],False).take(10)
[('2010-02-25', 61358),
 ('2011-07-03', 60066),
 ('2011-04-03', 59079),
 ('2011-07-04', 55221),
 ('2011-11-21', 50808),
 ('2011-07-08', 49545),
 ('2011-11-20', 49020),
 ('2011-11-13', 48613),
 ('2011-11-28', 48360),
 ('2011-11-22', 47939)]
 
 #top 10 clicks 
 adsense_r4.map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x:x[1],False).take(10)
 
 [('2011-11-26', 1044),
 ('2011-11-25', 1026),
 ('2011-11-28', 996),
 ('2011-11-29', 968),
 ('2011-11-18', 924),
 ('2011-11-16', 920),
 ('2011-11-17', 917),
 ('2011-11-12', 912),
 ('2011-08-06', 878),
 ('2011-11-13', 873)]
 
 # no of clicks >= 800 - Descending order
 adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x: x[1],False).collect()
 
[('2011-11-26', 1044),
 ('2011-11-25', 1026),
 ('2011-11-28', 996),
 ('2011-11-29', 968),
 ('2011-11-18', 924),
 ('2011-11-16', 920),
 ('2011-11-17', 917),
 ('2011-11-12', 912),
 ('2011-08-06', 878),
 ('2011-11-13', 873),
 ('2011-11-30', 849),
 ('2011-07-23', 845),
 ('2011-12-02', 842),
 ('2011-11-15', 829),
 ('2011-11-27', 823),
 ('2011-07-09', 812),
 ('2011-07-08', 806),
 ('2011-07-04', 802),
 ('2011-11-19', 801),
 ('2011-11-11', 800)]
 
 #no of clicks >= 800 - Ascending Order
 
 adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x: x[1],True).collect()
 
 [('2011-11-11', 800),
 ('2011-11-19', 801),
 ('2011-07-04', 802),
 ('2011-07-08', 806),
 ('2011-07-09', 812),
 ('2011-11-27', 823),
 ('2011-11-15', 829),
 ('2011-12-02', 842),
 ('2011-07-23', 845),
 ('2011-11-30', 849),
 ('2011-11-13', 873),
 ('2011-08-06', 878),
 ('2011-11-12', 912),
 ('2011-11-17', 917),
 ('2011-11-16', 920),
 ('2011-11-18', 924),
 ('2011-11-29', 968),
 ('2011-11-28', 996),
 ('2011-11-25', 1026),
 ('2011-11-26', 1044)]
 
#How many days had clicks >= 800?
 adsense_r4.filter(lambda x: x.fldClicks >= 800).count()
 
 20
 
#Total clicks over the days where clicks >= 800
 adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: x.fldClicks).reduce(lambda a,b:a+b)
 17667

#Total number of clicks across all days
adsense_r4.map(lambda x: x.fldClicks).reduce(lambda m,n: m+n)

248190



#Total number of page views across all days
adsense_r4.map(lambda x: x.fldPageViews).reduce(lambda m,n: m+n)

18626239
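
With the totals above, per-day averages follow directly; the mean() action computes them without a separate count step:

# Average page views and clicks per day (over days with non-zero earnings).
adsense_r4.map(lambda x: x.fldPageViews).mean()
adsense_r4.map(lambda x: x.fldClicks).mean()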

#Total days when page views >= 50000
adsense_r4.filter(lambda x: x.fldPageViews >= 50000).count()
5

#Total days when page views >= 10000
adsense_r4.filter(lambda x: x.fldPageViews >= 10000).count()

649
