Frequency of each word in a text file
customer.txt (first 10 rows shown; the output below was produced from the full file):
-------------
1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,00725
4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069
5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,10 Crystal River Mall ,Caguas,PR,00725
6,Mary,Smith,XXXXXXXXX,XXXXXXXXX,3151 Sleepy Quail Promenade,Passaic,NJ,07055
7,Melissa,Wilcox,XXXXXXXXX,XXXXXXXXX,9453 High Concession,Caguas,PR,00725
8,Megan,Smith,XXXXXXXXX,XXXXXXXXX,3047 Foggy Forest Plaza,Lawrence,MA,01841
9,Mary,Perez,XXXXXXXXX,XXXXXXXXX,3616 Quaking Street,Caguas,PR,00725
10,Melissa,Smith,XXXXXXXXX,XXXXXXXXX,8598 Harvest Beacon Plaza,Stafford,VA,22554
// Count how often each comma-separated value occurs in customer.txt and
// print the ten most frequent ones. Intended to be run in spark-shell, where `sc` is provided.

// Read the file as an RDD with one element per line.
val custRDD = sc.textFile("D:\\Ex\\customer.txt")
// Break every line into its comma-separated tokens and drop the XXXXXXXXX placeholder.
val words = custRDD.flatMap(line => line.split(",")).filter(_ != "XXXXXXXXX")
// Pair each token with an initial count of 1.
val pairRDD = words.map(token => (token, 1))
// Add up the counts of identical tokens.
val reducedRDD = pairRDD.reduceByKey((left, right) => left + right)
// Order by count, largest first.
val sortedRDD = reducedRDD.sortBy(_._2, ascending = false)
// Fetch the first ten (token, count) pairs to the driver and print them.
for (entry <- sortedRDD.take(10)) println(entry)
scala> val custRDD = sc.textFile("D:\\Ex\\customer.txt")
custRDD: org.apache.spark.rdd.RDD[String] = D:\Ex\customer.txt MapPartitionsRDD[6] at textFile at <console>:24
scala> val words = custRDD.flatMap(x => x.split(",")).filter(x => x != "XXXXXXXXX")
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[8] at filter at <console>:25
scala> val pairRDD = words.map(x => (x,1))
pairRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[9] at map at <console>:25
scala> val reducedRDD = pairRDD.reduceByKey(_+_)
reducedRDD: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[10] at reduceByKey at <console>:25
scala> val sortedRDD = reducedRDD.sortBy(x => x._2, false)
sortedRDD: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[15] at sortBy at <console>:25
scala> sortedRDD.take(10).foreach(println)
(Mary,1196)
(PR,1186)
(Smith,1160)
(Caguas,1152)
(00725,1152)
(CA,505)
(TX,183)
(NY,168)
(IL,123)
(FL,73)
No comments:
Post a Comment