Showing posts with label regex. Show all posts
Showing posts with label regex. Show all posts

Sunday, 20 January 2019

Web Log Analysis with Pattern Matching using Regex in Spark with Scala

input file: (weblog.txt)
------------
3.94.78.5 - 69827    [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.33.140.62 - 21475 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.31.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
3.91.78.5 - 69827    [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.32.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
19.35.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
3.93.78.5 - 69827    [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
12.38.140.62 - 2489 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
12.38.140.62 - 4712 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
13.94.78.5 - 69827    [15/Sep/2013:23:58:36 +0100] "GET /KBDOC-00033.html HTTP/1.0"
11.38.140.62 - 4712 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"
12.38.140.62 - 4712 [15/Sep/2013:23:58:34 +0100] "GET /KBDOC-00033.html HTTP/1.0"

scala> val r1 = sc.textFile("E:\\POCs\\weblog.txt")
r1: org.apache.spark.rdd.RDD[String] = E:\POCs\weblog.txt MapPartitionsRDD[8] at textFile at <console>:24

scala> r1.count
res9: Long = 12

// Regex for a dotted-quad IPv4-style address: four groups of 1-3 digits separated by literal dots.
// The dots are escaped (`\\.` inside an ordinary string literal) because an unescaped `.` matches
// ANY character, which would let a plain digit run such as "1234567" pass as an address.
// Octet ranges (0-255) are not validated; this only checks the textual shape.
scala> val ip_pattern = "[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}".r
ip_pattern: scala.util.matching.Regex = [0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}

// Regex for the numeric field that follows the IP address: the literal text " - ", then 1-5 digits,
// then one trailing space. The surrounding spaces and the dash are part of the match, so the
// matched string looks like " - 69827 " and has to be cleaned up before it can be parsed as a number.
scala> val port_pattern = " - [0-9]{1,5} ".r
port_pattern: scala.util.matching.Regex =  - [0-9]{1,5}

// Turn every raw log line into an (ip, port) pair of Strings, each being the first match of the
// corresponding regex in that line.
// NOTE(review): findFirstIn returns an Option, and calling .get on it throws
// NoSuchElementException for any line in which a pattern does not match; that would fail the
// Spark task once an action runs. Every line of the sample input matches both patterns.
scala> val r2 = r1.map { x =>
     | val ip = ip_pattern.findFirstIn(x).get
     | val port = port_pattern.findFirstIn(x).get
     | (ip,port)
     | }


scala> r2.count
res10: Long = 12

scala> r2.take(10).foreach(println)
(3.94.78.5, - 69827 )
(19.33.140.62, - 21475 )
(19.31.140.62, - 2489 )
(3.91.78.5, - 69827 )
(19.32.140.62, - 2489 )
(19.35.140.62, - 2489 )
(3.93.78.5, - 69827 )
(12.38.140.62, - 2489 )
(12.38.140.62, - 4712 )
(13.94.78.5, - 69827 )


// Convert the raw port match (e.g. " - 69827 ") into an Int while keeping the ip unchanged:
// slice(2, length-1) drops the leading " -" and the final space, trim() removes the one remaining
// leading space, and toInt parses the digits.
scala> val r3 = r2.map (x => (x._1, x._2.slice(2,x._2.length-1).trim().toInt))
r3: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[13] at map at <console>:25

scala> r3.foreach(println)
(3.93.78.5,69827)
(12.38.140.62,2489)
(12.38.140.62,4712)
(13.94.78.5,69827)
(11.38.140.62,4712)
(3.94.78.5,69827)
(19.33.140.62,21475)
(19.31.140.62,2489)
(3.91.78.5,69827)
(19.32.140.62,2489)
(19.35.140.62,2489)
(12.38.140.62,4712)

scala> val df = r3.toDF("ip","port")
df: org.apache.spark.sql.DataFrame = [ip: string, port: int]















scala> df.printSchema
root
 |-- ip: string (nullable = true)
 |-- port: integer (nullable = false)

Find DROPPED call records using Spark with Scala using Regular Expressions REGEX

scala> val rd1 = sc.textFile("E:\\POCs\\calllogdata.txt")
rd1: org.apache.spark.rdd.RDD[String] = E:\POCs\calllogdata.txt MapPartitionsRDD[1] at textFile at <console>:24

// Keep only the call-log lines that contain the substring "DROPPED" anywhere in the line.
scala> val drop_rdd = rd1.filter ( x=> x.contains ("DROPPED"))
drop_rdd: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[2] at filter at <console>:25

scala> val pattern_status = "[A-Z]{6,7}".r
pattern_status: scala.util.matching.Regex = [A-Z]{6,7}

scala> val pattern_phnos = "[0-9]{20}".r
pattern_phnos: scala.util.matching.Regex = [0-9]{20}


// For each DROPPED line build a (status, phone1, phone2) tuple of Strings:
//   status - first run of 6-7 uppercase letters matched by pattern_status
//   pnos   - first run of 20 consecutive digits matched by pattern_phnos
//   ph1    - characters 0-9 of pnos (first phone number)
//   ph2    - characters 10-19 of pnos (second phone number)
// NOTE(review): .get throws NoSuchElementException if either pattern finds no match in a line.
scala> val result_rdd = drop_rdd.map { x =>
       val status = pattern_status.findFirstIn(x).get
       val pnos = pattern_phnos.findFirstIn(x).get
       val ph1 = pnos.slice(0,10)
       val ph2 = pnos.slice(10,20)
       (status,ph1,ph2)
       }
result_rdd: org.apache.spark.rdd.RDD[(String, String, String)] = MapPartitionsRDD[3] at map at <console>:29


scala> result_rdd.collect.foreach(println)
(DROPPED,8052690057,7757919463)
(DROPPED,9886177375,9916790556)
(DROPPED,9876515616,4894949494)
(DROPPED,8055645645,8478787877)
(DROPPED,8080905609,5676236992)
(DROPPED,9609065215,8087080806)
(DROPPED,8979794646,5879874615)
(DROPPED,7458456456,4564564656)
(DROPPED,9879489494,5648947898)
(DROPPED,7854545645,6456456456)
(DROPPED,5755454897,9797979797)

Saturday, 19 January 2019

Call Log Data Analysis using Spark with Scala with Regex

Input Data in a file : calllogdata.txt
--------------------------------------
ec59cea2-5006-448f-a031-d5e53f33be232014-03-15 00:02:482014-03-15 00:06:05DROPPED 80526900577757919463
ec59cea2-5006-448f-a032-d5e53f33be232014-03-15 00:02:482014-03-15 00:06:07DROPPED 98861773759916790556
ec59cea2-5006-448f-a033-d5e53f33be232014-03-16 00:02:482014-03-16 00:06:45SUCCESS 86186279969886177375
ec59cea2-5006-448f-a034-d5e53f33be232014-03-16 00:02:482014-03-16 00:06:53DROPPED 98765156164894949494
ec59cea2-5006-448f-a035-d5e53f33be232014-03-16 00:02:482014-03-16 00:06:12FAILED  54545454546469496477
ec59cea2-5006-448f-a036-d5e53f33be232014-03-16 00:02:482014-03-16 00:06:05SUCCESS 12354678902153698431
ec59cea2-5006-448f-a037-d5e53f33be232014-03-17 00:02:482014-03-17 00:06:05DROPPED 80556456458478787877
ec59cea2-5006-448f-a038-d5e53f33be232014-03-17 00:02:482014-03-17 00:06:05DROPPED 80809056095676236992
ec59cea2-5006-448f-a039-d5e53f33be232014-03-17 00:02:482014-03-17 00:06:05FAILED  44554584848449644469
ec59cea2-5006-448f-a040-d5e53f33be232014-03-18 00:02:482014-03-18 00:06:05DROPPED 96090652158087080806
ec59cea2-5006-448f-a041-d5e53f33be232014-03-18 00:02:482014-03-18 00:06:05DROPPED 89797946465879874615
ec59cea2-5006-448f-a042-d5e53f33be232014-03-18 00:02:482014-03-18 00:06:05SUCCESS 45454545457978978979
ec59cea2-5006-448f-a043-d5e53f33be232014-03-18 00:02:482014-03-18 00:06:05DROPPED 74584564564564564656
ec59cea2-5006-448f-a044-d5e53f33be232014-03-19 00:02:482014-03-19 00:06:05DROPPED 98794894945648947898
ec59cea2-5006-448f-a045-d5e53f33be232014-03-19 00:02:482014-03-19 00:06:05SUCCESS 84645645605646064646
ec59cea2-5006-448f-a046-d5e53f33be232014-03-19 00:02:482014-03-19 00:06:05DROPPED 78545456456456456456
ec59cea2-5006-448f-a047-d5e53f33be232014-03-19 00:02:482014-03-19 00:06:05DROPPED 57554548979797979797
ec59cea2-5006-448f-a048-d5e53f33be232014-03-20 00:02:482014-03-20 00:06:05SUCCESS 87898640989489089409
ec59cea2-5006-448f-a049-d5e53f33be232014-03-20 00:02:482014-03-20 00:06:05SUCCESS 75884848478978978979
ec59cea2-5006-448f-a050-d5e53f33be232014-03-20 00:02:482014-03-20 00:06:05SUCCESS 74894086489489489489

scala> val rd1 = sc.textFile("E:\\POCs\\calllogdata.txt")
rd1: org.apache.spark.rdd.RDD[String] = E:\POCs\calllogdata.txt MapPartitionsRDD[1] at textFile at <console>:24

// regular expression to extract status from the line
scala> val pattern_status = "[A-Z]{6,7}".r
pattern_status: scala.util.matching.Regex = [A-Z]{6,7}

// regular expression to extract  2 different phone numbers from the line
scala> val pattern_phnos = "[0-9]{20}".r
pattern_phnos: scala.util.matching.Regex = [0-9]{20}

// Parse every call-log line (all statuses) into a (status, phno1, phno2) tuple of Strings.
// status is the first match of pattern_status; pnos is the first 20-digit run, which is split
// into two 10-character phone numbers with slice(0,10) and slice(10,20).
// NOTE(review): .get throws NoSuchElementException if either pattern finds no match in a line.
scala> val result_rdd = rd1.map( x => {
     | val status = pattern_status.findFirstIn(x).get
     | val pnos = pattern_phnos.findFirstIn(x).get
     | val phno1 = pnos.slice(0,10)
     | val phno2 = pnos.slice(10,20)
     | (status,phno1,phno2)
     | })
result_rdd: org.apache.spark.rdd.RDD[(String, String, String)] = MapPartitionsRDD[2] at map at <console>:33


scala> result_rdd.collect.foreach(println)
(DROPPED,8052690057,7757919463)
(DROPPED,9886177375,9916790556)
(SUCCESS,8618627996,9886177375)
(DROPPED,9876515616,4894949494)
(FAILED,5454545454,6469496477)
(SUCCESS,1235467890,2153698431)
(DROPPED,8055645645,8478787877)
(DROPPED,8080905609,5676236992)
(FAILED,4455458484,8449644469)
(DROPPED,9609065215,8087080806)
(DROPPED,8979794646,5879874615)
(SUCCESS,4545454545,7978978979)
(DROPPED,7458456456,4564564656)
(DROPPED,9879489494,5648947898)
(SUCCESS,8464564560,5646064646)
(DROPPED,7854545645,6456456456)
(DROPPED,5755454897,9797979797)
(SUCCESS,8789864098,9489089409)
(SUCCESS,7588484847,8978978979)
(SUCCESS,7489408648,9489489489)


// Replace the status string with a Boolean flag: true when the status equals "DROPPED", false
// otherwise. The two phone numbers are passed through unchanged, and no record is removed.
scala> val rdd_dropped = result_rdd.map ( x => {
     | val status = x._1 == "DROPPED"
     | val ph1 = x._2
     | val ph2 = x._3
     | (status,ph1,ph2)
     | })
rdd_dropped: org.apache.spark.rdd.RDD[(Boolean, String, String)] = MapPartitionsRDD[3] at map at <console>:31

scala> rdd_dropped.collect.foreach(println)
(true,8052690057,7757919463)
(true,9886177375,9916790556)
(false,8618627996,9886177375)
(true,9876515616,4894949494)
(false,5454545454,6469496477)
(false,1235467890,2153698431)
(true,8055645645,8478787877)
(true,8080905609,5676236992)
(false,4455458484,8449644469)
(true,9609065215,8087080806)
(true,8979794646,5879874615)
(false,4545454545,7978978979)
(true,7458456456,4564564656)
(true,9879489494,5648947898)
(false,8464564560,5646064646)
(true,7854545645,6456456456)
(true,5755454897,9797979797)
(false,8789864098,9489089409)
(false,7588484847,8978978979)
(false,7489408648,9489489489)


// Variant that uses an `if` with no `else` inside map. map always emits exactly one element per
// input record, so for non-DROPPED records the `if` expression evaluates to the Unit value, which
// is what prints as `()` in the output that follows.
// NOTE(review): the element type is presumably inferred as Any here (the REPL echo is not shown);
// to actually remove the non-DROPPED records, a filter on the status would be needed instead.
scala> val rdd_dropped = result_rdd.map ( x => {
       val status = x._1 == "DROPPED"
       val ph1 = x._2
       val ph2 = x._3
   if (status == true) { (status,ph1,ph2) }
       })
 
scala> rdd_dropped.collect.foreach(println)
(true,8052690057,7757919463)
(true,9886177375,9916790556)
()
(true,9876515616,4894949494)
()
()
(true,8055645645,8478787877)
(true,8080905609,5676236992)
()
(true,9609065215,8087080806)
(true,8979794646,5879874615)
()
(true,7458456456,4564564656)
(true,9879489494,5648947898)
()
(true,7854545645,6456456456)
(true,5755454897,9797979797)
()
()
()

Data Extraction using Regular Expression Pattern Matching in Scala

Here is sample code that extracts data from a line of text using regex pattern matching.

// Input text
scala> val line = "ec59cea2-5006-448f-a031-d5e53f33be232014-03-15 00:02:482014-03-15 00:06:05DROPPED 80526900577757919463"
line: String = ec59cea2-5006-448f-a031-d5e53f33be232014-03-15 00:02:482014-03-15 00:06:05DROPPED 80526900577757919463

// Regular expression to extract the call status from the line.
// The status is one of DROPPED, SUCCESS or FAILED, i.e. a run of 6 or 7 uppercase letters (A-Z).
scala> val pattern_status = "[A-Z]{6,7}".r
pattern_status: scala.util.matching.Regex = [A-Z]{6,7}

// extract status from the given line of text
scala> val status = pattern_status.findFirstIn(line).get
status: String = DROPPED

// Regular expression to extract the two phone numbers from the line.
// They appear as one continuous run of 20 digits (two 10-digit numbers back to back).
scala> val pattern_phnos = "[0-9]{20}".r
pattern_phnos: scala.util.matching.Regex = [0-9]{20}

scala> val pnos = pattern_phnos.findFirstIn(line).get
pnos: String = 80526900577757919463

// get first 10 digits
scala> val ph1 = pnos.slice(0,10)
ph1: String = 8052690057

// get last 10 digits
scala> val ph2 = pnos.slice(10,20)
ph2: String = 7757919463

// make a tuple
scala> (status,ph1,ph2)
res0: (String, String, String) = (DROPPED,8052690057,7757919463)

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...