Sunday, 7 April 2019

Find 2nd Maximum Salary from employee dataframe in Spark with Scala

scala> dfEmp.orderBy(desc("salary")).show
+---+-------+------+------+------+
| id|   name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha|     f|  4000|    10|
|105|  Priya|     f|  3600|    12|
|104| Rashee|     f|  3500|    11|
|109|  Vinay|     m|  3200|    10|
|102| Suresh|     m|  3000|    12|
|108| Arushi|     f|  2800|    12|
|111| Shilpa|     f|  2600|    12|
|110|  Kalai|     f|  2550|    11|
|107|  Aruvi|     f|  2500|    11|
|101|   Rani|     f|  2000|    11|
|103|  Rahul|     m|  1250|    10|
|100|   Ravi|     m|  1000|    10|
+---+-------+------+------+------+


scala> dfEmp.where($"salary" < dfEmp.agg(max("salary")).first().getInt(0)).orderBy(desc("salary")).show(1)
+---+-----+------+------+------+
| id| name|gender|salary|deptid|
+---+-----+------+------+------+
|105|Priya|     f|  3600|    12|
+---+-----+------+------+------+
only showing top 1 row

// 2nd maximum salaried person
hive> select * from emp where salary not in (select max(salary) from emp ) order by salary desc limit 1;
105 Priya f 3600 12

hive> select * from (select * from emp sort by salary desc limit 2) result sort by salary limit 1;

105 Priya f 3600 12

scala> dfEmp.orderBy(desc("Salary")).limit(2).orderBy("salary").show(1);
+---+-----+------+------+------+
| id| name|gender|salary|deptid|
+---+-----+------+------+------+
|105|Priya|     f|  3600|    12|
+---+-----+------+------+------+
only showing top 1 row



scala> dfEmp.orderBy(desc("Salary")).take(2)
res87: Array[org.apache.spark.sql.Row] = Array([106,Ayeesha,f,4000,10], [105,Priya,f,3600,12])

scala> dfEmp.orderBy(desc("Salary")).take(2)(1);
res91: org.apache.spark.sql.Row = [105,Priya,f,3600,12]

Friday, 5 April 2019

Hive Queries Vs Dataframe Queries - Part 1

hadoop@hadoop:~/Desktop/vow$ touch emp.txt
hadoop@hadoop:~/Desktop/vow$ atom emp.txt

101,Sathya,1000
102,Shanthi,2000
103,Mani,3000
104,Kalai,4000
105,Aruvi,5000
106,Nila,1500
107,Praveen,2500
108,Rashee,7500
109,Pinki,3500
110,Ravi,2500

pwd : /home/hadoop/Desktop/vow


hive> create database learning;
OK
Time taken: 0.901 seconds

hive> use learning;
OK
Time taken: 0.08 seconds



hive> create external table emp(id int, name varchar(50), salary int) row format delimited fields terminated by ',';

hive> load data local inpath '/home/hadoop/Desktop/vow/emp.txt' into table emp;


hive> select * from emp;
OK
101 Sathya 1000
102 Shanthi 2000
103 Mani 3000
104 Kalai 4000
105 Aruvi 5000
106 Nila 1500
107 Praveen 2500
108 Rashee 7500
109 Pinki 3500
110 Ravi 2500
Time taken: 0.305 seconds, Fetched: 10 row(s)


scala> val empSchema = StructType(StructField("id",IntegerType,true)::StructField("name",StringType,true)::StructField("salary",IntegerType,true)::Nil)
empSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(salary,IntegerType,true))

scala> val df = spark.read.format("csv").option("header","false").schema(empSchema).load("/home/hadoop/Desktop/vow/emp.txt");
df: org.apache.spark.sql.DataFrame = [id: int, name: string ... 1 more field]

scala> df.printSchema
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)


hive> select * from emp;
OK
101 Sathya 1000
102 Shanthi 2000
103 Mani 3000
104 Kalai 4000
105 Aruvi 5000
106 Nila 1500
107 Praveen 2500
108 Rashee 7500
109 Pinki 3500
110 Ravi 2500

scala> df.show
+---+-------+------+
| id|   name|salary|
+---+-------+------+
|101| Sathya|  1000|
|102|Shanthi|  2000|
|103|   Mani|  3000|
|104|  Kalai|  4000|
|105|  Aruvi|  5000|
|106|   Nila|  1500|
|107|Praveen|  2500|
|108| Rashee|  7500|
|109|  Pinki|  3500|
|110|   Ravi|  2500|
+---+-------+------+


hive> select max(salary) from emp;
7500

scala> df.select(max(df("salary")) as "Salary").show
or
scala> df.select(max($"salary") as "Salary").show

+------+
|Salary|
+------+
|  7500|
+------+

hive> select max(salary),min(salary) from emp;
OK
7500 1000

scala> df.select(max(df("salary")) as "MaxSal", min(df("salary")) as "MinSal").show
df.select(max($"salary") as "MaxSal",min($"salary") as "MinSal").show
+------+------+
|MaxSal|MinSal|
+------+------+
|  7500|  1000|
+------+------+


hive> select salary from emp order by salary;
OK
1000
1500
2000
2500
2500
3000
3500
4000
5000
7500

scala> df.select(df("salary")).orderBy("salary").show
df.select($"salary").orderBy($"salary").show
+------+
|salary|
+------+
|  1000|
|  1500|
|  2000|
|  2500|
|  2500|
|  3000|
|  3500|
|  4000|
|  5000|
|  7500|
+------+

hive> select salary from emp order by salary desc;
OK
7500
5000
4000
3500
3000
2500
2500
2000
1500
1000

import org.apache.spark.sql.functions._

scala> df.select(df("salary")).orderBy(desc("salary")).show
or
scala> df.select($"salary").orderBy($"salary".desc).show
+------+
|salary|
+------+
|  7500|
|  5000|
|  4000|
|  3500|
|  3000|
|  2500|
|  2500|
|  2000|
|  1500|
|  1000|
+------+



hive> select sum(salary) from emp;
OK
32500


scala> df.select(sum("salary") as "Sum").show
or
scala> df.select(sum($"salary") as "Sum").show

+-----+
|  Sum|
+-----+
|32500|
+-----+



hadoop@hadoop:~$ touch emp.txt
hadoop@hadoop:~$ atom emp.txt

id,name,gender,salary,deptid
100,Ravi,m,1000,10
101,Rani,f,2000,11
102,Suresh,m,3000,12
103,Rahul,m,1250,10
104,Rashee,f,3500,11
105,Priya,f,3600,12
106,Ayeesha,f,4000,10
107,Aruvi,f,2500,11
108,Arushi,f,2800,12
109,Vinay,m,3200,10
110,Kalai,f,2550,11
111,Shilpa,f,2600,12

hadoop@hadoop:~$ atom dept.txt
hadoop@hadoop:~$ atom dept.txt

deptid,deptname
10,Marketing
11,Sales
12,Production

//  tblproperties("skip.header.line.count"="1");  --> which skips the header line

hive> create external table emp(id int, name varchar(50),gender char(1), salary int, deptid int) row format delimited fields terminated by ',' tblproperties("skip.header.line.count"="1");


hive> load data local inpath "/home/hadoop/Desktop/vow/emp.txt" into table emp;





hive> create external table dept(deptid int, deptname varchar(50)) row format delimited fields terminated by ',' tblproperties("skip.header.line.count"="1");

hive> load data local inpath "/home/hadoop/Desktop/vow/dept.txt" into table dept;




scala> val empSchema = StructType(StructField("id",IntegerType,true)::StructField("name",StringType,true)::StructField("gender",StringType,true)::StructField("salary",IntegerType,true)::StructField("deptid",IntegerType,true)::Nil)
empSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,IntegerType,true), StructField(name,StringType,true), StructField(gender,StringType,true), StructField(salary,IntegerType,true), StructField(deptid,IntegerType,true))

scala> scala> val deptSchema = StructType(StructField("deptid",IntegerType,true)::StructField("deptname",StringType,true)::Nil)
deptSchema: org.apache.spark.sql.types.StructType = StructType(StructField(deptid,IntegerType,true), StructField(deptname,StringType,true))


 val dfEmp  = spark.read.format("csv").option("header","true").schema(empSchema).load("/home/hadoop/Desktop/vow/emp.txt");

 val dfDept = spark.read.format("csv").option("header","true").schema(deptSchema).load("/home/hadoop/Desktop/vow/dept.txt");

 hive> select * from emp;
OK
100 Ravi m 1000 10
101 Rani f 2000 11
102 Suresh m 3000 12
103 Rahul m 1250 10
104 Rashee f 3500 11
105 Priya f 3600 12
106 Ayeesha f 4000 10
107 Aruvi f 2500 11
108 Arushi f 2800 12
109 Vinay m 3200 10
110 Kalai f 2550 11
111 Shilpa f 2600 12

 scala> dfEmp.show
 or
 scala> dfEmp.select("*").show

+---+-------+------+------+------+
| id|   name|gender|salary|deptid|
+---+-------+------+------+------+
|100|   Ravi|     m|  1000|    10|
|101|   Rani|     f|  2000|    11|
|102| Suresh|     m|  3000|    12|
|103|  Rahul|     m|  1250|    10|
|104| Rashee|     f|  3500|    11|
|105|  Priya|     f|  3600|    12|
|106|Ayeesha|     f|  4000|    10|
|107|  Aruvi|     f|  2500|    11|
|108| Arushi|     f|  2800|    12|
|109|  Vinay|     m|  3200|    10|
|110|  Kalai|     f|  2550|    11|
|111| Shilpa|     f|  2600|    12|
+---+-------+------+------+------+

hive> select * from dept;
OK
10 Marketing
11 Sales
12 Production
Time taken: 0.238 seconds, Fetched: 3 row(s)

scala> dfDept.show
or
scala> dfDept.select("*").show

+------+----------+
|deptid|  deptname|
+------+----------+
|    10| Marketing|
|    11|     Sales|
|    12|Production|
+------+----------+


scala> dfEmp.select(max($"salary") as "MaxSal").show
+------+
|MaxSal|
+------+
|  4000|
+------+

hive> select max(salary) from emp;
4000


scala> dfEmp.select(min($"salary") as "MaxSal").show
+------+
|MaxSal|
+------+
|  1000|
+------+

hive> select min(salary) from emp;
1000


hive> select max(salary) as MaxSal, min(salary) as MinSal from emp;
4000 1000

scala> dfEmp.select(max("salary") as "MaxSal",min("salary") as "MinSal").show
+------+------+
|MaxSal|MinSal|
+------+------+
|  4000|  1000|
+------+------+


hive> select deptid,max(salary) from emp group by deptid order by deptid;
10 4000
11 3500
12 3600



scala> dfEmp.groupBy("deptid").agg(max("salary") as "maxSal").orderBy("deptid").show
+------+------+                                                               
|deptid|maxSal|
+------+------+
|    10|  4000|
|    11|  3500|
|    12|  3600|
+------+------+


hive> select deptid,count(name) from emp group by deptid order by deptid;
10 4
11 4
12 4

scala> dfEmp.groupBy("deptid").agg(count("name") as "nameCount").orderBy("deptid").show
+------+---------+                                                           
|deptid|nameCount|
+------+---------+
|    10|        4|
|    11|        4|
|    12|        4|
+------+---------+




 scala> dfEmp.select($"salary").orderBy("salary").show
 or
 scala> dfEmp.select(dfEmp("salary")).orderBy("salary").show

+------+
|salary|
+------+
|  1000|
|  1250|
|  2000|
|  2500|
|  2550|
|  2600|
|  2800|
|  3000|
|  3200|
|  3500|
|  3600|
|  4000|
+------+

select salary from emp order by salary
1000
1250
2000
2500
2550
2600
2800
3000
3200
3500
3600
4000


scala> dfEmp.select($"salary").orderBy(desc("salary")).show
or
dfEmp.select("salary").orderBy(desc("salary")).show
+------+
|salary|
+------+
|  4000|
|  3600|
|  3500|
|  3200|
|  3000|
|  2800|
|  2600|
|  2550|
|  2500|
|  2000|
|  1250|
|  1000|
+------+

hive> select salary from emp order by salary desc;
4000
3600
3500
3200
3000
2800
2600
2550
2500
2000
1250
1000



hive> select gender,max(salary) from emp group by gender order by gender;
f 4000
m 3200

scala> dfEmp.groupBy("gender").agg(max("salary") as "maxSal").orderBy("gender").show
+------+------+                                                               
|gender|maxSal|
+------+------+
|     f|  4000|
|     m|  3200|
+------+------+


hive> select gender,sum(salary) from emp group by gender order by gender;
f 23550
m 8450

scala> dfEmp.groupBy("gender").agg(sum("salary") as "GenderSumSal").orderBy("gender").show
+------+------------+                                                         
|gender|GenderSumSal|
+------+------------+
|     f|       23550|
|     m|        8450|
+------+------------+





hive> select * from emp order by salary desc;
OK
106 Ayeesha f 4000 10
105 Priya f 3600 12
104 Rashee f 3500 11
109 Vinay m 3200 10
102 Suresh m 3000 12
108 Arushi f 2800 12
111 Shilpa f 2600 12
110 Kalai f 2550 11
107 Aruvi f 2500 11
101 Rani f 2000 11
103 Rahul m 1250 10
100 Ravi m 1000 10


scala> dfEmp.orderBy(desc("salary")).show
+---+-------+------+------+------+
| id|   name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha|     f|  4000|    10|
|105|  Priya|     f|  3600|    12|
|104| Rashee|     f|  3500|    11|
|109|  Vinay|     m|  3200|    10|
|102| Suresh|     m|  3000|    12|
|108| Arushi|     f|  2800|    12|
|111| Shilpa|     f|  2600|    12|
|110|  Kalai|     f|  2550|    11|
|107|  Aruvi|     f|  2500|    11|
|101|   Rani|     f|  2000|    11|
|103|  Rahul|     m|  1250|    10|
|100|   Ravi|     m|  1000|    10|
+---+-------+------+------+------+




hive> select * from emp order by salary desc limit 2;
OK
106 Ayeesha f 4000 10
105 Priya f 3600 12


scala> dfEmp.orderBy(desc("salary")).show(2);
+---+-------+------+------+------+
| id|   name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha|     f|  4000|    10|
|105|  Priya|     f|  3600|    12|
+---+-------+------+------+------+
only showing top 2 rows



// top salaried person
hive> select * from emp order by salary desc limit 1;
106 Ayeesha f 4000 10


scala> dfEmp.orderBy(desc("salary")).show(1);
+---+-------+------+------+------+
| id|   name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha|     f|  4000|    10|
+---+-------+------+------+------+
only showing top 1 row






//extract single value (scalar) from dataframe
scala> val x:Int = dfEmp.agg(max("salary")).head().getInt(0)
x: Int = 4000



scala> dfEmp.orderBy(desc("salary")).show
+---+-------+------+------+------+
| id|   name|gender|salary|deptid|
+---+-------+------+------+------+
|106|Ayeesha|     f|  4000|    10|
|105|  Priya|     f|  3600|    12|
|104| Rashee|     f|  3500|    11|
|109|  Vinay|     m|  3200|    10|
|102| Suresh|     m|  3000|    12|
|108| Arushi|     f|  2800|    12|
|111| Shilpa|     f|  2600|    12|
|110|  Kalai|     f|  2550|    11|
|107|  Aruvi|     f|  2500|    11|
|101|   Rani|     f|  2000|    11|
|103|  Rahul|     m|  1250|    10|
|100|   Ravi|     m|  1000|    10|
+---+-------+------+------+------+


scala> dfEmp.where($"salary" < dfEmp.agg(max("salary")).first().getInt(0)).orderBy(desc("salary")).show(1)
+---+-----+------+------+------+
| id| name|gender|salary|deptid|
+---+-----+------+------+------+
|105|Priya|     f|  3600|    12|
+---+-----+------+------+------+
only showing top 1 row

// 2nd maximum salaried person
hive> select * from emp where salary not in (select max(salary) from emp ) order by salary desc limit 1;
105 Priya f 3600 12

hive> select * from (select * from emp sort by salary desc limit 2) result sort by salary limit 1;

105 Priya f 3600 12

scala> dfEmp.orderBy(desc("Salary")).limit(2).orderBy("salary").show(1);
+---+-----+------+------+------+
| id| name|gender|salary|deptid|
+---+-----+------+------+------+
|105|Priya|     f|  3600|    12|
+---+-----+------+------+------+
only showing top 1 row



scala> dfEmp.orderBy(desc("Salary")).take(2)
res87: Array[org.apache.spark.sql.Row] = Array([106,Ayeesha,f,4000,10], [105,Priya,f,3600,12])

scala> dfEmp.orderBy(desc("Salary")).take(2)(1);
res91: org.apache.spark.sql.Row = [105,Priya,f,3600,12]

Thursday, 4 April 2019

Calculate the Square Root of Sum of Squares of Each numbers in a given file - using UDF which use Option..Some..None

Calculate the Square Root of Sum of Squares of Each numbers in a given file - using UDF which use Option..Some..None

// Excluded all characters from each line and find the square root of sum of squares of each numbers

$ cat charsAndNumbers.txt
1,a,b,c,2,3,4
2,3,4,x,y,z
s,t,u,5,2
m,n,8,10
5,2,1,a,x,y
7,a,x,2,6,h


scala> val r1 = sc.textFile("/home/hadoop/Desktop/vow/charsAndNumbers.txt")

// user defined function to extract only integers and exclude all characters
 def toInt(s:String):Option[Int] ={
   try{
Some(s.toInt)
   }
   catch {
case e: Exception => None
   }
       }
     

val r2 = r1.map(x => {
                       val fields = x.split(",")
   var s = 0
   for(f <- fields)
{
val currentNumber =  toInt(f).getOrElse(0) // calling UDF
if (currentNumber != 0){
s = s + (currentNumber * currentNumber)
}
}

s
                   })


scala> r2.collect
res1: Array[Int] = Array(30, 29, 29, 164, 30, 89)


scala> val result = r2.reduce(_+_)
result: Int = 371


scala> val finalResult = scala.math.sqrt(result)
finalResult: Double = 19.261360284258224

scala> scala.math.sqrt( (1*1) + (2*2) + (3*3) + (4*4)
     |  + (2*2) + (3*3) + (4*4)
     |  + (5*5) + (2*2)
     |  + (8*8) + (10*10)
     |  + (5*5) + (2*2) + (1*1)
     |  + (7*7) + (2*2) + (6*6))
res8: Double = 19.261360284258224



Find the square root of sum of squares of each numbers from a file using Spark with Scala

// Exclude all characters from each line and find the square root of sum of squares of each numbers


// given input file has character and numbers separated by comma
$ cat charsAndNumbers.txt
1,a,b,c,2,3,4
2,3,4,x,y,z
s,t,u,5,2
m,n,8,10
5,2,1,a,x,y
7,a,x,2,6,h


scala.math.sqrt( (1*1) + (2*2) + (3*3) + (4*4)
 + (2*2) + (3*3) + (4*4)
 + (5*5) + (2*2)
 + (8*8) + (10*10)
 + (5*5) + (2*2) + (1*1)
 + (7*7) + (2*2) + (6*6))

scala> val r1 = sc.textFile("/home/hadoop/Desktop/vow/charsAndNumbers.txt")

scala> r1.foreach(println)
1,a,b,c,2,3,4
2,3,4,x,y,z
s,t,u,5,2
m,n,8,10
5,2,1,a,x,y
7,a,x,2,6,h



val r2 = r1.map(x => {
                       val fields = x.split(",")
   var s = 0
   for(f <- fields)
{
  try
  {
s = s + (f.toInt * f.toInt)
  }
  catch
  {
case ex: Exception  => {
}
  }
}
s
                   })


scala> r2.collect
res1: Array[Int] = Array(30, 29, 29, 164, 30, 89)

scala> (1*1) + (2*2) + (3*3) + (4*4)
res2: Int = 30

scala> (2*2) + (3*3) + (4*4)
res3: Int = 29

scala> (5*5) + (2*2)
res4: Int = 29

scala> (8*8) + (10*10)
res5: Int = 164

scala> (5*5) + (2*2) + (1*1)
res6: Int = 30

scala> (7*7) + (2*2) + (6*6)
res7: Int = 89



scala> r2.foreach(println)
30
29
29
164
30
89


scala> val result = r2.reduce(_+_)
result: Int = 371


scala> val finalResult = scala.math.sqrt(result)
finalResult: Double = 19.261360284258224

scala> scala.math.sqrt( (1*1) + (2*2) + (3*3) + (4*4)
     |  + (2*2) + (3*3) + (4*4)
     |  + (5*5) + (2*2)
     |  + (8*8) + (10*10)
     |  + (5*5) + (2*2) + (1*1)
     |  + (7*7) + (2*2) + (6*6))
res8: Double = 19.261360284258224



Wednesday, 3 April 2019

Unix Shell Scripting Crash Course

$ touch hello.sh // create an empty file
$ atom hello.sh  // open atom editor
hello.sh:
--------
#!/usr/bin/env bash
#hello world sample script
echo Hello world!
echo Mars is red!

hadoop@hadoop:~/Desktop/vow/shellscript$ chmod 755 hello.sh

$ ./hello.sh

$ sh hello.sh
Hello world!
Mars is red!



$ touch greeting.sh

$ atom greeting.sh
#!/usr/bin/env bash
FIRST_NAME=Bob
FAVORITE_COLOR=blue
echo Hi $FIRST_NAME, your favorite color is $FAVORITE_COLOR

$ ./greeting.sh
Hi Bob, your favorite color is blue


greeting.sh:
-------------
#!/usr/bin/env bash
FIRST_NAME="Bob Roberts"
FAVORITE_COLOR=blue
echo Hi $FIRST_NAME, your favorite color is $FAVORITE_COLOR


$ ./greeting.sh
Hi Bob Roberts, your favorite color is blue

//Parameters
$0 - name of the script, the path is included

$1, $2, $3, ${10},
${255} - the last parameter


$ touch params.sh
$ atom params.sh

params.sh
------------
#!/usr/bin/env bash
echo File name is, $0
echo First Name : $1
echo Last Name : $2


$ ./params.sh Kapil Dev
File name is, ./params.sh
First Name : Kapil
Last Name : Dev



#!/usr/bin/env bash
FIRST_NAME=$1
LAST_NAME=$2
echo Hello, $FIRST_NAME, $LAST_NAME
echo $`date`
echo $`pwd`


$ bash params.sh sare ga
Hello, sare, ga
$Wed Apr 3 10:15:42 IST 2019
$/home/hadoop/Desktop/vow/shellscript



Challenge:
----------
Create a script named sport.sh
Make it executable
Accept 2 parameters : name and a favorite sport
Display any sentence to the console using those inputs


sport.sh:
----------
#!/usr/bin/env bash
NAME=$1
SPORT=$2
echo $NAME likes to watch $SPORT.

Execute the script:
-------------------
$ sh sport.sh Vijay Soccer
Vijay likes to watch Soccer.


If example:
-----------
#!/usr/bin/env bash
COLOR=$1
if [ $COLOR = "blue" ]
then
  echo "The color is blue"
fi

USER_GUESS=$2
COMPUTER=50

if [ $USER_GUESS -lt $COMPUTER ]
then
    echo "You are too Low"
fi


$ ./if.sh blue 40
The color is blue
You are too Low



#!/usr/bin/env bash
COLOR=$1
if [ $COLOR = "blue" ]
then
  echo "The color is blue"
else
  echo "The color is NOT blue"
fi

USER_GUESS=$2
COMPUTER=50

if [ $USER_GUESS -lt $COMPUTER ]
then
    echo "You are too Low"
else
    echo "You are equal or too high"
fi


$ ./if.sh Orange 55
The color is NOT blue
You are equal or too high



#!/usr/bin/env bash
USER_GUESS=$1
COMPUTER=50
if [ $USER_GUESS -lt $COMPUTER ]
then
  echo "You are too low"
elif [ $USER_GUESS -gt $COMPUTER ]
then
  echo "You are too High"
else
  echo "You have guessed it"
fi


$ ./if.sh 50
You have guessed it

$ ./if.sh 44
You are too low

$ ./if.sh 66
You are too High




whileexa.sh:
------------
#!/usr/bin/env bash
COUNT=0
while [ $COUNT -lt 10 ]
do
  echo "COUNT = $COUNT"
  ((COUNT++))
done

echo "While Loop Finished!".
exit 0


$ ./whileexa.sh
COUNT = 0
COUNT = 1
COUNT = 2
COUNT = 3
COUNT = 4
COUNT = 5
COUNT = 6
COUNT = 7
COUNT = 8
COUNT = 9
While Loop Finished!.


for.sh: (for each - multiple arguments)
-------
#!/usr/bin/env bash
NAMES=$@

for NAME in $NAMES
do
  echo "Hello $NAME"
done

echo "for loop terminated"



$ ./for.sh a b c d e f g h (n number of variable number of arguments)
Hello a
Hello b
Hello c
Hello d
Hello e
Hello f
Hello g
Hello h
for loop terminated


$ ./for.sh Awesome Outstanding Tremendous
Hello Awesome
Hello Outstanding
Hello Tremendous
for loop terminated



// break example
for.sh:
-------
#!/usr/bin/env bash
NAMES=$@

for NAME in $NAMES
do
  if [ $NAME = "Tracy" ]
  then
    break
  fi
  echo "Hello $NAME"
done

echo "for loop terminated"



$ ./for.sh Stacy Tracy Lacy
Hello Stacy
for loop terminated



Challenge:
-----------
Write a script named counter.sh
It should count from 1 to the number entered by the user
Through the loop, display the current count value
Once the loop terminates, display "Loop finished"

#!/usr/bin/env bash
COUNT=1
END=$1

while [ $COUNT -le $END ]
do
  echo "COUNT = $COUNT"
  ((COUNT++))
done

echo "Loop Finished."


$ ./counter.sh 5
COUNT = 1
COUNT = 2
COUNT = 3
COUNT = 4
COUNT = 5
Loop Finished.


$ ./counter.sh 7
COUNT = 1
COUNT = 2
COUNT = 3
COUNT = 4
COUNT = 5
COUNT = 6
COUNT = 7
Loop Finished.


//Environment variable example
vars.sh:
--------
#!/usr/bin/env bash
echo "The PATH is : $PATH"
echo "The terminal is : $TERM"
echo "The editor is : $EDITOR"

if [ -z $EDITOR ]
then
  echo "The EDITOR variable is not set"
fi

PATH="/bob"
echo "The PATH IS : $PATH"


output:
---------
$ ./vars.sh
The PATH is : /usr/local/hive/bin:/usr/local/hive/lib:/usr/local/spark/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/usr/local/java/bin:/usr/local/hadoop/bin:/usr/local/hadoop/sbin:/usr/local/kafka/bin
The terminal is : xterm-256color
The editor is :
The EDITOR variable is not set
The PATH IS : /bob

HOME - user's home directory
PATH - directories which are searched for commands
HOSTNAME - hostname of the machine
SHELL - shell thats being used
USER - user of this session
TERM - type of command-line terminal that is being used


// display environmental variables
$ echo $HOME
/home/hadoop

$ echo $PATH
/usr/local/hive/bin:/usr/local/hive/lib:/usr/local/spark/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/usr/local/java/bin:/usr/local/hadoop/bin:/usr/local/hadoop/sbin:/usr/local/kafka/bin

$ echo $HOSTNAME
hadoop

$ echo $SHELL
/bin/bash

$ echo $TERM
xterm-256color

$ echo $USER
hadoop

Challenge:
----------

Create a Script named : env.sh
Display a sentence
Include the computer name, user's name, home directory

env.sh:
--------
#!/usr/bin/env bash
echo "The computer's name is $HOSTNAME, the user's name is $USER, AND THE home directory is $HOME"
exit 0


$ ./env.sh
The computer's name is hadoop, the user's name is hadoop, AND THE home directory is /home/hadoop


Function Example:
-------------------
#!/usr/bin/env bash

function Hello(){
  echo "Hello!"
}

Goodbye(){
  echo "Goodbye!"
}

echo "Calling the Hello function"
Hello

echo "Calling the Goodbye function"
Goodbye
# Do not call like Goodbye()
exit 0
#define the function first, then call it later


$ ./func.sh
Calling the Hello function
Hello!
Calling the Goodbye function
Goodbye!


// Function with Parameters example
#!/usr/bin/env bash

function Hello(){
  local LNAME=$1
  echo "Hello! $LNAME"
}

Goodbye(){
  echo "Goodbye! $1"
}

echo "Calling the Hello function"
Hello Steve

echo "Calling the Goodbye function"
Goodbye David

exit 0



$ ./func.sh
Calling the Hello function
Hello! Steve
Calling the Goodbye function
Goodbye! David


//pipe operation example:
#!/usr/bin/env bash

FILES=`ls -1 | sort -r | head -3`
COUNT=1

for FILE in $FILES
do
  echo "FILE #$COUNT : $FILE"
  ((count++))
done

exit 0



output:
----------
$ ./pipe.sh
FILE #COUNT : whileexa.sh
FILE #COUNT : vars.sh
FILE #COUNT : sport.sh


Challenge:
----------

Create a script named pfunc.sh
Create two functions in the script:
a) GetFiles,
b) ShowFiles

GetFiles returns the first 10 files in the directory

pfunch.sh:
-----------
#!/usr/bin/env bash
function GetFiles(){
  FILES=`ls -1 | sort | head -10`
}

function ShowFiles(){
  local COUNT=1
  for FILE in $@
  do
    echo "FILE #$COUNT : $FILE"
    ((COUNT++))
  done
}

GetFiles
ShowFiles $FILES

exit 0

output:
-------
$ ./pfunch.sh
FILE #1 : counter.sh
FILE #2 : env.sh
FILE #3 : for.sh
FILE #4 : func.sh
FILE #5 : greeting.sh
FILE #6 : hello.sh
FILE #7 : if.sh
FILE #8 : params.sh
FILE #9 : pfunch.sh
FILE #10 : pipe.sh


File Example:
--------------

names.txt:
----------
Ravi
Kumar
Aradhana
Siva
Karthikeyan
Dhanush
Viswanathan
SelvaKumar
Ulagappan

fileReader.sh:
---------------
#!/usr/bin/env bash
COUNT=1

while IFS="" read -r LINE
do
  echo "LINE $COUNT : $LINE"
  ((COUNT++))
done < "$1"

exit 0


//IFS means Internal Field Separator \t \n ,
// execute the script and pass names.txt as input argument / parameter
$ ./fileReader.sh names.txt
LINE 1 : Ravi
LINE 2 : Kumar
LINE 3 : Aradhana
LINE 4 : Siva
LINE 5 : Karthikeyan
LINE 6 : Dhanush
LINE 7 : Viswanathan
LINE 8 : SelvaKumar
LINE 9 : Ulagappan


// create a file using redirection operators

 $ ./fileReader.sh names.txt > output1.txt  // overwrite / create
 $ ./fileReader.sh names.txt >> output1.txt // append
 $ cat output1.txt
LINE 1 : Ravi
LINE 2 : Kumar
LINE 3 : Aradhana
LINE 4 : Siva
LINE 5 : Karthikeyan
LINE 6 : Dhanush
LINE 7 : Viswanathan
LINE 8 : SelvaKumar
LINE 9 : Ulagappan
LINE 1 : Ravi
LINE 2 : Kumar
LINE 3 : Aradhana
LINE 4 : Siva
LINE 5 : Karthikeyan
LINE 6 : Dhanush
LINE 7 : Viswanathan
LINE 8 : SelvaKumar
LINE 9 : Ulagappan



// cksum of a file
//whether the file is tampered or not

hadoop@hadoop:~/Desktop/vow/shellscript$ cat  names.txt
Ravi
Kumar
Aradhana
Siva
Karthikeyan
Dhanush
Viswanathan
SelvaKumar
Ulagappan

hadoop@hadoop:~/Desktop/vow/shellscript$ cksum names.txt  // original
4016456064 78 names.txt  // 3 parameters checksum value, filesize,filename

hadoop@hadoop:~/Desktop/vow/shellscript$ atom names.txt
Ravi
Kumar
Aradhana
Siva
Karthikeyan
Dhanush
Viswanaathan // added one extra 'a'
SelvaKumar
Ulagappan

hadoop@hadoop:~/Desktop/vow/shellscript$ cksum names.txt
783674190 79 names.txt  // because of file tampered, the cksum value changed


hadoop@hadoop:~/Desktop/vow/shellscript$ atom names.txt
Ravi
Kumar
Aradhana
Siva
Karthikeyan
Dhanush
Viswanathan // removed extra 'a'
SelvaKumar
Ulagappan

hadoop@hadoop:~/Desktop/vow/shellscript$ cksum names.txt
4016456064 78 names.txt  // file is not tampered - i mean transported without any corruption, or modification

4016456064 -- value should be sent via e-mail. that number should match before and after tranfer


Challenge:
----------
Create a script named read3.sh
Have it read a file name passed as a parameter
It should only display the first 3 lines, with count

read3.sh:
--------
#!/usr/bin/env bash
COUNT=1

while IFS="" read -r LINE
do
  echo "LINE $COUNT : $LINE"
  if [ $COUNT -ge 3 ]
  then
    break
  fi
  ((COUNT++))
done < "$1"

exit 0

output:
--------
$ ./read3.sh names.txt
LINE 1 : Ravi
LINE 2 : Kumar
LINE 3 : Aradhana


delayexa.sh:
-------------
#!/usr/bin/env bash
DELAY=$1
if [ -z $DELAY ]
then
  echo "You must supply a delay"
  exit 1
fi

echo "Going to sleep for $DELAY seconds"
sleep $DELAY
echo "We are awake now."
exit 0



// put & at the end.
$ ./delayexa.sh 5 &
[1] 5833 // process id of the script running in background
hadoop@hadoop:~/Desktop/vow/shellscript$ Going to sleep for 5 seconds
We are awake now.

[1]+  Done                    ./delayexa.sh 5


output:
-------
$ ./delayexa.sh 5
Going to sleep for 5 seconds
We are awake now.




proc.sh:
---------

#!/usr/bin/env bash
STATUS=0

if [ -z $1 ]
then
  echo "Please supply a PID"
  exit 1
fi

echo "Watching PID = $1"

while [ $STATUS -eq 0 ]
do
  ps $1 > /dev/null
  STATUS=$?
done
echo "Process $1 has terminated"
exit 0


// open a new terminal window, and start nano editor
$ nano a


$ ps -a
  PID TTY          TIME CMD
 5971 pts/1    00:00:00 nano // see the process id for nano is 5971
 5973 pts/0    00:00:00 ps


// start the shell script
$ ./proc.sh 5971
Watching PID = 5971

// stop nano
Process 5971 has terminated



// run the delayexa.sh script
$ ./delayexa.sh 20 &
[1] 8468
hadoop@hadoop:~/Desktop/vow/shellscript$ Going to sleep for 20 seconds
We are awake now.

// pass the process id of delay program
$ ./proc.sh 8468
Watching PID = 8468
Process 8468 has terminated



//Get input from user
prompt.sh:
----------
#!/usr/bin/env bash
read -p "What is your first name? " NAME
echo "Your name is : $NAME"
exit 0


output:
-------
$ ./prompt.sh
What is your first name? sarega
Your name is : sarega




//validating user input in shell scripting:
user.sh:
---------
#!/usr/bin/env bash
VALID=0

while [ $VALID -eq 0 ]
do
  read -p "Please enter your name and age : " NAME  AGE
  if [[ ( -z $NAME ) || ( -z $AGE ) ]]
  then
    echo "Not enough parameters passed"
    continue
  elif [[ ! $NAME =~ ^[A-Za-z]+$ ]]
  then
    echo "Non Alpha characters detected [ $NAME ]"
    continue
  elif [[ ! $AGE =~ ^[0-9]+$ ]]
  then
    echo "Non digit characters detected [ $AGE ]"
    continue
  fi
  VALID=1
done
echo "Name = $NAME and Age = $AGE"
exit 0



output:
---------
$ ./user.sh
Please enter your name and age : SARA 40
Name = SARA and Age = 40
hadoop@hadoop:~/Desktop/vow/shellscript$ ./user.sh
Please enter your name and age :
Not enough parameters passed
Please enter your name and age : SA
Not enough parameters passed
Please enter your name and age : SA 3
Name = SA and Age = 3


//Challenge
Create a script named guess.sh
Set a global variable named COMPUTER to a number between 1 and 50
Take input from the user
if the user's input matches COMPUTER, they won



guess.sh:
---------
#!/usr/bin/env bash
COMPUTER=50
PLAYING=0
while [ $PLAYING -eq 0 ]
do
  read -p "What's your guess : " INPUT
  if [ $INPUT -eq $COMPUTER ]
  then
    echo "You've won, the number was $COMPUTER"
    exit 0
  elif [ $INPUT -lt $COMPUTER ]
  then
    echo "You're too low"
  fi
done

exit 0




$ ./guess.sh
What's your guess : 3
You're too low
What's your guess : 5
You're too low
What's your guess : 50
You've won, the number was 50

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...