Saturday, 30 May 2020

Call Log Regular Expressions in PySpark

Each call log record packs a UUID, two timestamps, a call status, and the caller and callee phone numbers into a single string; the snippet below pulls those fields apart with Python regular expressions.

import re

# Fixed-layout call log records: UUID, start timestamp, end timestamp,
# status (SUCCESS/DROPPED/FAILED), then two concatenated 10-digit
# phone numbers (caller followed by callee).
in1 = 'ec59cea2-5006-448f-a031-d5e53f33be232014-03-15 00:02:332014-03-15 06:03:42DROPPED 80526900577757919463'
in2 = 'ec59cea2-5006-448f-a047-d5e53f33be232014-03-19 00:03:482014-03-19 05:02:33DROPPED 57554548979797979797'
in3 = 'ec59cea2-5006-448f-a039-d5e53f33be232014-03-17 00:04:452014-03-17 04:06:05FAILED  44554584848449644469'
in4 = 'ec59cea2-5006-448f-a045-d5e53f33be232014-03-19 00:05:482014-03-19 03:02:34SUCCESS 84645645605646064646'
in5 = 'ec59cea2-5006-448f-a050-d5e53f33be232014-03-20 00:06:282014-03-20 01:06:05SUCCESS 74894086489489489489'
inputs = [in1, in2, in3, in4, in5]


def extract(log):
    # The only run of 20 consecutive digits is the phone number block:
    # the first 10 digits are the caller, the last 10 the callee.
    pnos = re.search(r'\d{20}', log).group()
    fromno = pnos[0:10]
    tono = pnos[10:20]
    # Alternative status pattern: re.search(r'[A-Z]{6,7}', log).group()
    status = re.search(r'SUCCESS|DROPPED|FAILED', log).group()
    # Both timestamps share the yyyy-MM-dd HH:mm:ss shape;
    # findall returns them in order of appearance.
    timestamps = re.findall(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}', log)
    starttime = timestamps[0]
    endtime = timestamps[1]
    return (fromno, tono, status, starttime, endtime)


for logg in inputs:
    print(extract(logg))
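
The parsing itself is plain Python, so it plugs straight into Spark. Below is a minimal sketch of running the same extraction in PySpark, assuming a local SparkSession; the app name 'CallLogDemo' and the column names are illustrative choices, not part of the original log format.

from pyspark.sql import SparkSession

# A minimal sketch: distribute the raw log lines, parse each one with
# extract(), and build a DataFrame from the resulting tuples.
# Assumes Spark is installed locally.
spark = SparkSession.builder.appName('CallLogDemo').getOrCreate()

rdd = spark.sparkContext.parallelize(inputs).map(extract)
df = spark.createDataFrame(rdd, ['fromno', 'tono', 'status', 'starttime', 'endtime'])
df.show(truncate=False)

On a real cluster the parallelize() call would be replaced by reading the log files, e.g. spark.sparkContext.textFile(...), with the same map(extract) step afterwards.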
