flatMapValues and mapValues Example:
d1 = spark.sparkContext.parallelize([('a', (20, 30, 40, 50)), ('b', (1, 2, 3))])
print(d1.count())
#2

# flatMapValues flattens each value into separate (key, element) pairs
print(d1.flatMapValues(lambda x: x).collect())
#[('a', 20), ('a', 30), ('a', 40), ('a', 50), ('b', 1), ('b', 2), ('b', 3)]

# map receives the whole (key, value) tuple, so the key must be rebuilt by hand
print(d1.map(lambda x: (x[0], len(x[1]))).collect())
#[('a', 4), ('b', 3)]

# mapValues applies the function to the value only, leaving the key untouched
print(d1.mapValues(lambda x: len(x)).collect())
#[('a', 4), ('b', 3)]
keyBy Example:
d1 = spark.sparkContext.parallelize([(101, 'Vijay', 'BTech'), (102, 'Balaji', 'Bsc'), (103, 'Arun')])
print(d1.collect())
#[(101, 'Vijay', 'BTech'), (102, 'Balaji', 'Bsc'), (103, 'Arun')]

# keyBy turns each record into a (key, record) pair, keyed by f(record)
d2 = d1.keyBy(lambda x: x[0])
print(d2.collect())
#[(101, (101, 'Vijay', 'BTech')), (102, (102, 'Balaji', 'Bsc')), (103, (103, 'Arun'))]
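Once keyBy has produced a pair RDD, any key-based operation applies. A small sketch joining the keyed records with a second RDD of marks (marks is an illustrative RDD, not part of the original data):

# Inner-join on the student id produced by keyBy; 103 has no mark, so it drops out
marks = spark.sparkContext.parallelize([(101, 88), (102, 75)])
print(d2.join(marks).collect())
#[(101, ((101, 'Vijay', 'BTech'), 88)), (102, ((102, 'Balaji', 'Bsc'), 75))]  (order may vary)

# lookup returns all records stored under a given key
print(d2.lookup(103))
#[(103, 'Arun')]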