Repartition vs coalesce
Repartition - a full shuffle is involved, so it is slower
- it can either increase or decrease the number of partitions
coalesce - increasing the number of partitions is not allowed (by default it does not shuffle)
- we can only decrease the number of partitions
# Load the CSV as an RDD of text lines; no partition count is requested here,
# so Spark chooses the initial number.
e1 = spark.sparkContext.textFile("E:\\DataSets\\olympix_data.csv")

# repartition(): ask for a specific number of partitions (up or down).
e2 = e1.repartition(5)
e3 = e1.repartition(2)

# coalesce(): can only reduce; a request above the current count is ignored.
c1 = e1.coalesce(2)
c2 = e1.coalesce(10)

# Partition counts, in order: e1, e2, e3, c1, c2.
# Values reported by the author's run: 2, 5, 2, 2, 2
for _rdd in (e1, e2, e3, c1, c2):
    print(_rdd.getNumPartitions())
2
5
2
2
2
# Re-read the source file so this example starts from a fresh RDD.
e1 = spark.sparkContext.textFile("E:\\DataSets\\olympix_data.csv")

# One RDD per sport: repartition to a distinct count first, then keep only the
# lines that mention the sport. The counts printed below show that filter()
# keeps whatever partition count repartition() produced.
football = e1.repartition(4).filter(lambda line: 'Football' in line)
wrestling = e1.repartition(5).filter(lambda line: 'Wrestling' in line)
weightlifting = e1.repartition(6).filter(lambda line: 'Weightlifting' in line)

# Partition counts, in order: e1, football, wrestling, weightlifting.
# Values reported by the author's run: 2, 4, 5, 6
for _rdd in (e1, football, wrestling, weightlifting):
    print(_rdd.getNumPartitions())
2
4
5
6
# Repartition with Union
# The inputs were repartitioned to 4, 5 and 6 partitions; the union reports
# 15, i.e. the sum of the three.
unionResult = (
    football
    .union(wrestling)
    .union(weightlifting)
)
print(unionResult.getNumPartitions())  # 15

# Repartition with Intersection
# Lines present in all three sport RDDs. The author's run also reported 15
# partitions here.
intersectionResult = (
    football
    .intersection(wrestling)
    .intersection(weightlifting)
)
print(intersectionResult.getNumPartitions())  # 15
# Rebuild the source RDD and the three per-sport RDDs (4, 5 and 6 partitions).
e1 = spark.sparkContext.textFile("E:\\DataSets\\olympix_data.csv")
football = e1.repartition(4).filter(lambda line: 'Football' in line)
wrestling = e1.repartition(5).filter(lambda line: 'Wrestling' in line)
weightlifting = e1.repartition(6).filter(lambda line: 'Weightlifting' in line)

# subtract(): lines of e1 that do not appear in football.
# Reported count is 6 -- NOTE(review): this matches 2 (e1) + 4 (football);
# confirm against the Spark version in use.
subt = e1.subtract(football)
print(subt.getNumPartitions())  # 6

# cartesian(): every (football line, wrestling line) pair.
# Reported count is 20, which matches 4 * 5.
cart = football.cartesian(wrestling)
print(cart.getNumPartitions())  # 20
No comments:
Post a Comment