import collections
from datetime import datetime
Adsense = collections.namedtuple('Adsense',['fldDate','fldPageViews','fldImpressions','fldClicks','fldPageRPM','fldImpressionRPM','fldActiveViewViewable','fldEstimatedEarnings'])
def parseAdsense(_row):
fields = _row.split(",")
_date = fields[0]
_pageViews = (int) (fields[1])
_impressions = (int) (fields[2])
_clicks = (int) (fields[3])
_pageRPM = fields[4]
_impressionRPM = fields[5]
_activeviewviewable = fields[6]
_estimatedEarnings = (float) (fields[7])
_adsense = Adsense(_date,_pageViews,_impressions,_clicks,_pageRPM,_impressionRPM,_activeviewviewable,_estimatedEarnings)
return _adsense
#print(parseMovie("1::Toy Story (1995)::Animation|Children's|Comedy"))
adsense_r1 = spark.sparkContext.textFile("E:\\DataSets\\adsense_data.csv")
#print(adsense_r1.count())
#for j in adsense_r1.collect():
# print(j)
headerInfo = adsense_r1.first()
#print(headerInfo)
adsense_r2 = adsense_r1.filter(lambda x: headerInfo not in x)
#print(adsense_2.first())
#print(adsense_2.count())
#for j in adsense_r2.collect():
# print(j)
adsense_r3 = adsense_r2.map(lambda x:(str)(x)).map(parseAdsense)
print(adsense_r3.count())
adsense_r4 = adsense_r3.filter(lambda x: x.fldEstimatedEarnings != 0)
print(adsense_r4.count())
2452
1511
#Find the total Earnings
totalEarnings = adsense_r4.map(lambda x: x.fldEstimatedEarnings).reduce(lambda x,y : x+y)
print(totalEarnings) #16995.739999999994
#Find the top 10 Earnings without Date
adsense_r4.map(lambda x: x.fldEstimatedEarnings).sortBy(lambda x: x, False).take(10)
[63.71, 54.14, 53.77, 52.73, 52.14, 50.86, 49.63, 48.91, 48.8, 48.58]
#Find the top 10 Earnings with Date
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).take(10)
[('2010-02-25', 63.71),
('2011-11-28', 54.14),
('2010-12-18', 53.77),
('2011-11-21', 52.73),
('2010-12-19', 52.14),
('2011-07-13', 50.86),
('2011-12-02', 49.63),
('2011-06-24', 48.91),
('2011-09-27', 48.8),
('2011-07-04', 48.58)]
#Find the least 10 Earnings without Date
adsense_r4.map(lambda x: x.fldEstimatedEarnings).sortBy(lambda x: x, True).take(10)
[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]
#Find the least 10 Earnings with Date
adsense_r4.map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], True).take(10)
[('2009-02-09', 0.01),
('2009-02-17', 0.01),
('2011-12-10', 0.01),
('2009-03-07', 0.01),
('2011-12-13', 0.01),
('2008-09-11', 0.01),
('2008-09-08', 0.01),
('2008-09-03', 0.01),
('2008-09-05', 0.01),
('2011-12-06', 0.01)]
#in which dates we got Estimated Earnings >= 40
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 40).map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).collect()
[('2010-02-25', 63.71),
('2011-11-28', 54.14),
('2010-12-18', 53.77),
('2011-11-21', 52.73),
('2010-12-19', 52.14),
('2011-07-13', 50.86),
('2011-12-02', 49.63),
('2011-06-24', 48.91),
('2011-09-27', 48.8),
('2011-07-04', 48.58),
('2011-06-23', 47.93),
('2011-07-12', 46.85),
('2011-11-30', 46.69),
('2011-07-08', 46.45),
('2010-12-14', 45.68),
('2011-06-26', 45.5),
('2011-10-19', 45.22),
('2011-06-30', 45.02),
('2011-06-22', 44.82),
('2011-06-25', 44.58),
('2011-07-16', 44.56),
('2011-07-09', 44.45),
('2011-11-23', 44.23),
('2011-07-27', 44.0),
('2011-11-22', 43.78),
('2011-09-21', 43.59),
('2011-11-24', 43.48),
('2011-07-11', 43.33),
('2011-11-29', 43.27),
('2011-07-05', 43.08),
('2011-09-18', 43.02),
('2011-11-20', 42.98),
('2011-09-13', 42.29),
('2011-09-17', 42.28),
('2011-08-09', 42.17),
('2011-10-27', 42.08),
('2011-09-20', 41.96),
('2011-07-23', 41.6),
('2011-09-23', 41.5),
('2011-11-15', 41.43),
('2011-10-18', 41.4),
('2011-07-14', 41.36),
('2011-07-10', 41.21),
('2011-11-17', 41.15),
('2011-09-19', 40.69),
('2011-11-26', 40.49),
('2011-08-05', 40.05)]
How many days we got Estimated Earnings >= 40?
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 40).count()
47
>10 and <= 50
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 10 and x.fldEstimatedEarnings <= 50).map(lambda x: (x.fldDate, x.fldEstimatedEarnings)).sortBy(lambda x: x[1], False).collect()
>10 and <= 50 - count
adsense_r4.filter(lambda x: x.fldEstimatedEarnings >= 10 and x.fldEstimatedEarnings <= 50).count()
583
#top 10 page views
adsense_r4.map(lambda x: (x.fldDate, x.fldPageViews)).sortBy(lambda x:x[1],False).take(10)
[('2010-02-25', 61358),
('2011-07-03', 60066),
('2011-04-03', 59079),
('2011-07-04', 55221),
('2011-11-21', 50808),
('2011-07-08', 49545),
('2011-11-20', 49020),
('2011-11-13', 48613),
('2011-11-28', 48360),
('2011-11-22', 47939)]
#top 10 clicks
adsense_r4.map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x:x[1],False).take(10)
[('2011-11-26', 1044),
('2011-11-25', 1026),
('2011-11-28', 996),
('2011-11-29', 968),
('2011-11-18', 924),
('2011-11-16', 920),
('2011-11-17', 917),
('2011-11-12', 912),
('2011-08-06', 878),
('2011-11-13', 873)]
# no of clicks >= 800 - Descending order
adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x: x[1],False).collect()
[('2011-11-26', 1044),
('2011-11-25', 1026),
('2011-11-28', 996),
('2011-11-29', 968),
('2011-11-18', 924),
('2011-11-16', 920),
('2011-11-17', 917),
('2011-11-12', 912),
('2011-08-06', 878),
('2011-11-13', 873),
('2011-11-30', 849),
('2011-07-23', 845),
('2011-12-02', 842),
('2011-11-15', 829),
('2011-11-27', 823),
('2011-07-09', 812),
('2011-07-08', 806),
('2011-07-04', 802),
('2011-11-19', 801),
('2011-11-11', 800)]
#no of clicks >= 800 - Ascending Order
adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: (x.fldDate, x.fldClicks)).sortBy(lambda x: x[1],True).collect()
[('2011-11-11', 800),
('2011-11-19', 801),
('2011-07-04', 802),
('2011-07-08', 806),
('2011-07-09', 812),
('2011-11-27', 823),
('2011-11-15', 829),
('2011-12-02', 842),
('2011-07-23', 845),
('2011-11-30', 849),
('2011-11-13', 873),
('2011-08-06', 878),
('2011-11-12', 912),
('2011-11-17', 917),
('2011-11-16', 920),
('2011-11-18', 924),
('2011-11-29', 968),
('2011-11-28', 996),
('2011-11-25', 1026),
('2011-11-26', 1044)]
#How many days we got clicks >= 800
adsense_r4.filter(lambda x: x.fldClicks >= 800).count()
20
#Total number of clicks where individual day's click >= 800
adsense_r4.filter(lambda x: x.fldClicks >= 800).map(lambda x: x.fldClicks).reduce(lambda a,b:a+b)
17667
#Total number of clicks all the day
adsense_r4.map(lambda x: x.fldClicks).reduce(lambda m,n: m+n)
248190
#Total number of page views all the day
adsense_r4.map(lambda x: x.fldPageViews).reduce(lambda m,n: m+n)
18626239
#Total days when page views >= 50000
adsense_r4.filter(lambda x: x.fldPageViews >= 50000).count()
5
#Total days when page views >= 10000
adsense_r4.filter(lambda x: x.fldPageViews >= 10000).count()
649
No comments:
Post a Comment