ctrl + l ===> clear
REPL (Read Eval Print Loop)
Interactive Interpreter
spark-shell -- to start spark shell
var a_int:Int = 3
var a_char:Char ='d'
var a_long:Long =3234234234L
var b =3
var x =3434l
var z =234234.0f
Collections :
Sequences:
Vectors, Streams, Lists, Queues, Strings, Stacks
Sets:
HashSet, SortedSet, TreeSet, BitSet, ListSet
Maps:
HashMaps, SortedMaps, TreeMaps, ListMaps
Mutable collections can be changed in place
Immutable collections cannot be changed once created (see the sketch below)
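A minimal sketch of the difference using the standard library (List is immutable, ListBuffer is mutable):
// immutable List: operations return a new collection, the original is unchanged
val nums = List(1, 2, 3)
val more = nums :+ 4                      // more = List(1, 2, 3, 4); nums is still List(1, 2, 3)
// mutable ListBuffer: operations change the collection in place
import scala.collection.mutable.ListBuffer
val buf = ListBuffer(1, 2, 3)
buf += 4                                  // buf is now ListBuffer(1, 2, 3, 4)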
Array:
Indexed collection of values
Mutable collection types
val temps = Array(50,51,56,53,40)
temps(0)
temps.length
val t1:Array[Int] = new Array[Int](10)
t1: Array[Int] = Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
scala> val t3 = Array.ofDim[Int](5,5)
t3: Array[Array[Int]] = Array(Array(0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0))
Import Library:
import Array._
- (the underscore is equivalent to the * asterisk wildcard)
scala> concat(t1,t1)
res14: Array[Int] = Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Type the name of the Array variable followed by a dot and press TAB (e.g. Array. TAB) to display all the functions and properties associated with it (IntelliSense-style completion)
scala> val vec1:Vector[Int]=Vector(1,2,3,4,5,6,7,8,9,10)
vec1: Vector[Int] = Vector(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
scala> vec1(1)
res15: Int = 2
Range:
Data structure representing a sequence of integer values
Default step value is 1
val myRange = 1 to 10 //// (step value is 1)
myRange: scala.collection.immutable.Range.Inclusive = Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
val myRange2:Range = new Range(1,101,2)
Maps:
Scala collection used to store key/value pairs
var capitals = Map("India"->"Delhi", "Pakistan"->"Lahore","SriLanka"->"Colombo")
capitals
scala> capitals.keys
res17: Iterable[String] = Set(India, Pakistan, SriLanka)
scala> capitals.values
res18: Iterable[String] = MapLike(Delhi, Lahore, Colombo)
scala> capitals get ("India")
res20: Option[String] = Some(Delhi)
scala> capitals("India")
res24: String = Delhi
scala> capitals getOrElse("Vietnam","Not Found")
res22: String = Not Found
capitals+= ("TamilNadu"->"Chennai")
scala> capitals
res35: scala.collection.immutable.Map[String,String] = Map(India -> Delhi, Pakistan -> Lahore, SriLanka -> Colombo, TamilNadu -> Chennai)
capitals-="SriLanka"
scala> capitals
res37: scala.collection.immutable.Map[String,String] = Map(India -> Delhi, Pakistan -> Lahore, TamilNadu -> Chennai)
scala> var a =10
a: Int = 10
scala> var b = 20
b: Int = 20
scala> var c = a+b
c: Int = 30
scala> print (c)
30
scala> 10%4
res39: Int = 2
scala> 3 > 4
res40: Boolean = false
Single expression using blocks { }
scala> println({
| val a = 2*3
| a+4
| })
Functions:
----------
def myFunction(a:Int, b:Int) : Int = {
val c = a*b
return c
}
myFunction(5,2)
scala> def myFunction(a:Int, b:Int) : Int = {
| return a*b
| }
myFunction: (a: Int, b: Int)Int
scala> myFunction(5,2)
res42: Int = 10
The above are functions with a return value.
Functions without a return value (procedures - used for side effects like logging, printing messages)
void ==> Unit
def myProcedure(inStr:String) : Unit = {
println(inStr)
}
myProcedure("Hai!")
scala> def myProcedure(inStr:String) : Unit ={
| println(inStr)
| }
myProcedure: (inStr: String)Unit
scala> myProcedure("Sarega")
Sarega
scala> val y =Array("England","India","SriLanka","Pakistan")
y: Array[String] = Array(England, India, SriLanka, Pakistan)
scala> y
res44: Array[String] = Array(England, India, SriLanka, Pakistan)
scala> y.sorted
res45: Array[String] = Array(England, India, Pakistan, SriLanka)
Class : Creating object
-----------------------
scala> class Person(var name:String, var city:String, var zip:String)
defined class Person
scala> var p = new Person("Ravi","Pallathur","630107")
p: Person = $iwC$$iwC$Person@3a945ccf
scala> p
res47: Person = $iwC$$iwC$Person@3a945ccf // default toString: class name + hash code, not the field values
scala> p.name
res48: String = Ravi
scala> p.city
res49: String = Pallathur
scala> p.zip
res50: String = 630107
scala> class Point2D(coord1:Int, coord2:Int){
| var a:Int = coord1
| var b:Int = coord2
| def move(deltaA:Int, deltaB:Int){
| a+=deltaA
| b+=deltaB
| }
| }
defined class Point2D
scala> val p1 = new Point2D(10,20)
p1: Point2D = $iwC$$iwC$Point2D@10c6b580
scala> p1.a
res52: Int = 10
scala> p1.b
res53: Int = 20
scala> p1.move(50,100)
scala> p1.a
res55: Int = 60
scala> p1.b
res56: Int = 120
Parallel Collection:
convert a sequential collection into a parallel collection
create a variable with a parallel collection
Range of 100 integers: (Sequence)
val rng100 = 1 to 100
scala.collection.immutable.Range.Inclusive
val prng100 = rng100.par (Parallel) // converting sequential range into Parallel Range
scala.collection.parallel.immutable.ParRange
prng100. TAB
ctrl + L
import scala.collection.parallel.immutable.ParVector // importing from its namespace
val pec200 = ParVector.range(0,200)
val prng100 = rng100.par.filter(_ > 80)
val v = (1 to 100).toArray
val pv = v.par
Multiply each member of v array by 2 using map
v.map(_ * 2) ---> sequential
pv.map(_ * 2) ---> parallel
A Map collection is a group of key/value pairs
The map method is a functional programming construct which lets you apply a function to each member of a collection
Function in Parallel execution:
def SquareIt(x:Int) : Int = { return x * x }
SquareIt(4) ==> 16
v.map(SquareIt(_))
pv.map(SquareIt(_))
Filter collection:
val v = (1 to 10000).toArray
val pv = v.par
v.length
pv.length
How to get value > 5000
val pvf = pv.filter(_ > 5000)
pvf.length ==> 5000 (the values 5001..10000)
FilterNot == Filter Negation
val pvf2 = pv.filterNot(_ > 5000) /// Not operator while doing filter
Function which returns Boolean:
------------------------------
def div3 (x:Int) :Boolean = {val y:Int=(x % 3); return (y==0)}
div3(3) ==> true
scala> def div3(x:Int) : Boolean = {val y:Int = (x % 3); return (y == 0) }
div3: (x: Int)Boolean
scala> div3(3)
res66: Boolean = true
scala> pv.filter(div3(_))
res67: scala.collection.parallel.mutable.ParArray[Int] = ParArray(3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99)
pv.filterNot(div3(_))
Scala REPL: (Read Eval Print Loop)
spark-shell
object HelloWorld {
def main(args: Array[String]) {
println("Who is going to come with me?")
}
}
Intro to Spark:
---------------
Scala is a functional programming language and also an object-oriented language
Distributed processing framework : spark
Spark is written in Scala
Spark - Advantages:
Fast processing, with libraries for analytics
Stream Processing
Fault Tolerant
Scalable
Cloud Environment : Easy to add clusters, nodes
spark-shell : Start Spark
scala>
Extends the parallel collection idea to distributed processing capability
RDDs:
-----
Immutable distributed collection
Organized into logical partitions
Fault-tolerant collection
May keep data in memory or persisted
RDDs like Parallel Collections
------------------------------
Groups of data of the same type or structure
Data Processed in parallel
Faster than working with sequential operations
RDDs unlike Parallel Collections
------------------------------
partitioned by a hash function
distributed across multiple servers
can be easily persisted to permanent storage
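A small spark-shell sketch of those points, assuming the usual sc SparkContext; the partition count of 8 is just an illustrative value:
val rdd = sc.parallelize(1 to 100000, numSlices = 8)   // explicitly split into 8 partitions
rdd.getNumPartitions                                    // how many partitions the data was split into
rdd.persist()                                           // ask Spark to keep it in memory across actions
rdd.sum()                                               // first action computes (and caches) the partitions
rdd.sum()                                               // later actions reuse the cached partitions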
Create a Range:
---------------
import scala.util.Random
var bigRng = scala.util.Random.shuffle(1 to 100000)
Convert Range into RDD:
-------------------------
val bigPRng = sc.parallelize(bigRng)
bigPRng.mean
bigPRng.min
bigPRng.max
Mapping Functions over RDDs
---------------------------
import scala.util.Random
val bigRng = scala.util.Random.shuffle(1 to 100000) // collection of Random numbers
val bigPRng = sc.parallelize(bigRng)
bigPRng.take(25)
val bigPRng2 = bigPRng.map(_ * 2)
bigPRng2.take(25)
bigPRng2.mean
bigPRng2.min
bigPRng2.max
UDF:
scala> def div(x:Int):Boolean = {val y:Int=(x%3); return (y==0)}
div: (x: Int)Boolean
scala> div(3)
res11: Boolean = true
scala> div(4)
res12: Boolean = false
val bigBool = bigPRng2.map(div(_))
bigBool.take(25)
res13: Array[Boolean] = Array(false, false, true, false, true, false, false, true, true, false, false, false, false, false, true, true, true, true, false, false, false, true, false, true, false)
res13.take(5)
not working**********************
[cloudera@quickstart Downloads]$ wget http://www.gutenberg.org/cache/epub/1497/pg1497.txt
hdfs dfs -copyFromLocal pg1497.txt thisOne
hdfs dfs -cat thisOne/pg1497.txt
hdfs://quickstart.cloudera:8020/home/cloudera/thisOne/pg1497.txt
/home/cloudera/Downloads/pg1497.txt
thisOne/pg1497.txt
/root/freshproducts.csv
not working**********************
val republic = sc.textFile("/Users/danielsullivan/Downloads/pg1497.txt")
republic.take(25).foreach(println)
val linesWithSocrates = republic.filter(line => line.contains("Socrates"))
linesWithSocrates.take(10).foreach(println)
Statistics
Descriptive - understand the shape of the data
Hypothesis testing - make predictions
import scala.util.Random
import org.apache.spark.mllib.stat.Statistics
val bigRng = scala.util.Random.shuffle(1 to 100000)
val bigPRng = sc.parallelize(bigRng)
val bigPRng2 = bigPRng.map (_ * 2)
bigPRng2.take(25).foreach(println)
bigPRng2.mean
bigPRng2.min
bigPRng2.max
bigPRng2.stats
val sample = bigPRng2.takeSample(true,100)
Take sample: subset which is randomly collected from our collection
val series1 = Array.fill(100000)(Random.nextDouble)
val series2 = Array.fill(100000)(Random.nextDouble)
val Pseries1 = sc.parallelize(series1)
val Pseries2 = sc.parallelize(series2)
val myCorrelation:Double = Statistics.corr(Pseries1,Pseries2,"pearson")
val distTest = Statistics.kolmogorovSmirnovTest(Pseries1,"norm",0,1)
Data Frames:
Kind of like relational tables: a data structure organized as rows and named columns
import org.apache.spark.sql.SparkSession
Data Frame #1: (Employee table)
-------------- (Employee)
val spark = SparkSession.builder().appName("DataFrameExercise").getOrCreate()
val df_emps = spark.read.option("header","true").csv("file:///C:/Spark/file.csv.csv")
read 10 records:
df_emps.take(10)
to see the structure:
df_emps.schema
display records in tabular view:
df_emps.show()
Data Frame #2: (Country Region)
--------------
val df_cr = spark.read.option("header","true").csv("country_region.txt")
df_cr.take(10)
df_cr.show()
df_cr.columns
Data Frame #3: (Department Division)
------------------------------------
val df_dd = spark.read.option("header","true").csv("/home/cloudera/Downloads/dept_div.txt")
df_dd.show()
df_emps.show()
Create a temporary view: SQL in Spark
------------------------
df_emps.createOrReplaceTempView("employees") // it will allow us to use SQL
val sqldf_emps = spark.sql("SELECT * from employees")
val sqldf_emps_by_dept = spark.sql("SELECT department, count(*) FROM employees GROUP BY department")
sqldf_emps_by_dept.show()
val sqldf_emps_by_dept_gender = spark.sql("SELECT department,gender,count(*) FROM employees GROUP BY department,gender")
sqldf_emps_by_dept_gender.show()
val sqldf_depts = spark.sql("SELECT DISTINCT department FROM employees")
sqldf_depts.show()
Filtering: Display employees whose id less than 100
-----------
val sqldf_emps_100 = spark.sql("SELECT * FROM employees WHERE id < 100")
sqldf_emps_100.show()
JOINING Table using Data Frames:
--------------------------------
df_emps.show() // region_id is here
df_cr.show() // region_id is here too
val df_joined = df_emps.join(df_cr,"region_id")
df_joined.columns --> combined columns of both data frames
df_joined.show() --> employee + region info combined together
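If the key column had different names in the two DataFrames (say region_id in one and region_code in the other - the second name is hypothetical), a join expression can be used instead of the column-name shortcut; a sketch:
val df_joined2 = df_emps.join(df_cr, df_emps("region_id") === df_cr("region_code"))
df_joined2.show()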
Working with JSON:
-----------------
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder.appName("DataFrameExercise").getOrCreate
val df_json_dd = spark.read.json("/home/cloudera/Downloads/dept_div.json")
df_json_dd.show()
Spark
Spark Vs MapReduce
Spark RDDs
Spark DataFrames
Used to quickly and easily handle big data
Apache Top level open source project
Created at AMPLab at UC Berkeley
Flexible alternative to MapReduce
Spark can use data stored in a variety of stores like Cassandra, AWS S3, HDFS, ...
MapReduce requires files to be stored in HDFS, Spark Doesn't
Can Perform operations up to 100X faster than MapReduce
MapReduce writes most data to disk after each map and reduce operation
Spark keeps most of the data in memory after each transformation
Spark can spill over to disk if the memory is filled
Core of Spark : Resilient Distributed DataSet (RDDs)
Features of RDDs:
Distributed Collection of Data
Fault-tolerant
Parallel operation - partitioned
Ability to use many data sources
Driver Program (SparkContxt)
Cluster Manager
Worker Node:
Executor (Task), Cache
Worker Node:
Executor (Task(s)),Cache
Master (Job)
---> Task @ Slave Node
<--- Result Slave (Data cached in RAM / Disk)
---> Task @ Slave Node
<--- Result Slave (Data cached in RAM / Disk)
---> Task @ Slave Node
<--- Result Slave (Data cached in RAM / Disk)
2 Types of operations:
Transformation : recipes to follow
Actions : perform what the recipe says to do and return something back
RDDs are immutable, lazily evaluated, cacheable
Transformations, Actions
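A minimal spark-shell sketch of the two kinds of operations, assuming sc is available:
val rdd = sc.parallelize(1 to 10)        // create an RDD
val doubled = rdd.map(_ * 2)             // Transformation: only records the recipe, nothing runs yet
doubled.count()                          // Action: triggers the computation, returns 10
doubled.collect()                        // Action: returns Array(2, 4, 6, ..., 20)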
RDD Vs DataFrame syntax
Spark is moving towards a DataFrame based syntax.
Distributed files can still be thought of as RDDs
Cleaner syntax to work with Data : DataFrame
Data Types in Scala:
--------------------
Int
Double
String
Boolean
var myVar:Int = 10
var myVal:Double = 2.5
val myString = "Hello" // val myString:String = "Hello"
Interpolation:
--------------
inserting a value into a string
scala> var name=10
name: Int = 10
scala> println("hello $name")
hello $name
scala> println(f"hello $name")
hello 10
scala> println(s"hello $name")
hello 10
scala> "dance"+5
res34: String = dance5
scala> "dance"*5
res35: String = dancedancedancedancedance
var st:String="Hello"
st.length or st.length()
val name="Jose"
val greets = s"Hello ${name}"
scala> var name:String="Jose"
name: String = Jose
scala> val greet = s"Hello ${name}"
greet: String = Hello Jose
scala> greet
res1: String = Hello Jose
scala> println(greet)
Hello Jose
scala> val greet = s"hello $name"
greet: String = hello Jose
scala> printf("A string %s, an integer %d, a float %f","Hi",10,12.345)
A string Hi, an integer 10, a float 12.345000
scala> val st="This is a long string"
st: String = This is a long string
scala> st.charAt(0)
res5: Char = T
scala> st.indexOf("a")
res6: Int = 8
scala> st slice(0,4)
res8: String = This
scala> st indexOf("a")
res9: Int = 8
scala> val st ="Alagappa"
st: String = Alagappa
scala> st matches "Alagappa"
res10: Boolean = true
scala> st.matches("Alagappa")
res11: Boolean = true
scala> st.contains("gapp")
res12: Boolean = true
Tuples:
Ordered sequence of values in multiple data types
scala> (1,2,2,"hai")
res13: (Int, Int, Int, String) = (1,2,2,hai)
scala> val myTuple =(1,2,"hello",23.2,true)
myTuple: (Int, Int, String, Double, Boolean) = (1,2,hello,23.2,true)
scala> (3,1,(2,3))
res14: (Int, Int, (Int, Int)) = (3,1,(2,3))
scala> myTuple._3
res15: String = hello
scala> myTuple._5
res16: Boolean = true
Ordered sequence of values that can hold values of multiple data types
Scala collections:
Lists
Sets
Maps
Arrays
scala> val evens = List(2,4,6,8,10)
evens: List[Int] = List(2, 4, 6, 8, 10)
scala> evens(0)
res22: Int = 2
scala> evens(4)
res23: Int = 10
scala> var exa = List(1,2.0,true,List(1,2,3))
exa: List[Any] = List(1, 2.0, true, List(1, 2, 3))
scala> var e = List(List(1,2,3),List(2,3,4),List(3,4,5))
e: List[List[Int]] = List(List(1, 2, 3), List(2, 3, 4), List(3, 4, 5))
scala> var ee = List(List("sa",2,3.0),List("sare",3,3.5))
ee: List[List[Any]] = List(List(sa, 2, 3.0), List(sare, 3, 3.5))
scala> var ee = List(List("sa",2,3.0),List("sare",33.3,3.5))
ee: List[List[Any]] = List(List(sa, 2, 3.0), List(sare, 33.3, 3.5))
scala> var ee = List(List("sa",2,3.0),List("sare",33.3,3.5),List(true,false,false))
ee: List[List[Any]] = List(List(sa, 2, 3.0), List(sare, 33.3, 3.5), List(true, false, false))
scala> var m = List(1,9,0,-1,3.5,2)
m: List[Double] = List(1.0, 9.0, 0.0, -1.0, 3.5, 2.0)
scala> m.sorted
res24: List[Double] = List(-1.0, 0.0, 1.0, 2.0, 3.5, 9.0)
scala> m.max
res25: Double = 9.0
scala> m.min
res26: Double = -1.0
scala> m.sum
res27: Double = 14.5
val x = List(1,2,3,4)
x.drop(2)
x.takeRight(1)
val x = List(1,2,3,4,5,6,7,8)
x.slice(0,3)
scala> List(1,2,3,4,5,6,7,8).slice(0,3)
res0: List[Int] = List(1, 2, 3)
scala> List(1,2,3,4,5,6,7,8).slice(0,3).slice(0,1)
res1: List[Int] = List(1)
scala> var x = List(1,2,3,4,5,6,7,8)
x: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 8)
scala> x slice (0,2)
res2: List[Int] = List(1, 2)
scala> val arr = Array(1,2,3)
arr: Array[Int] = Array(1, 2, 3)
scala> val arr1 = Array("a","b","c")
arr1: Array[String] = Array(a, b, c)
scala> val arr2 = Array(1,2.2,true,"hai")
arr2: Array[Any] = Array(1, 2.2, true, hai)
Generate sequence using Range:
-----------------------------
val a = Array.range(0,1000)
val b = Array.range(0,1000,55) // with step value
Range(0,5)
// scala.collection.immutable.Range
Set:
A Set is a Scala collection that holds only unique elements - no duplicate elements
Immutable and Mutable Sets
scala> val duplicate = Set(1,1,1,3,3,2,2,2,2,-1,-1,9)
duplicate: scala.collection.immutable.Set[Int] = Set(1, 9, 2, 3, -1)
scala> val s = collection.immutable.Set(1,2,3)
s: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
scala> s
res4: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
scala> s += 4
<console>:28: error: value += is not a member of scala.collection.immutable.Set[Int]
s += 4
^
scala> val s = collection.mutable.Set(1,2,3)
s: scala.collection.mutable.Set[Int] = Set(1, 2, 3)
scala> s+4
res6: scala.collection.mutable.Set[Int] = Set(1, 2, 3, 4)
scala> s.add(234234)
res15: Boolean = true
scala> s
res16: scala.collection.mutable.Set[Int] = Set(1, 234, 2, 3, 234234)
scala> s.min
res17: Int = 1
scala> s.max
res18: Int = 234234
scala> val myList = List(1,2,3,1,2,3)
myList: List[Int] = List(1, 2, 3, 1, 2, 3)
scala> val myNewSet = myList.toSet()
myNewSet: Boolean = false
Converting a list with duplicate values into a new Set keeps only the unique elements
scala> val myNewSet = myList.toSet // Typecasting
myNewSet: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
Map:
----
Key,value pairs storage
HashTable, dictionary
scala> val myMap = Map(("Murugesan","Designer"),("Sankar",".Net"),("Sudha","Lit"))
myMap: scala.collection.immutable.Map[String,String] = Map(Murugesan -> Designer, Sankar -> .Net, Sudha -> Lit)
scala> myMap.keys
res19: Iterable[String] = Set(Murugesan, Sankar, Sudha)
scala> myMap.values
res20: Iterable[String] = MapLike(Designer, .Net, Lit)
scala> myMap
res21: scala.collection.immutable.Map[String,String] = Map(Murugesan -> Designer, Sankar -> .Net, Sudha -> Lit)
scala> myMap("Murugesan")
res23: String = Designer
scala> myMap("Sudha")
res24: String = Lit
While doing a lookup, instead of passing an integer index we pass the key
scala> myMap.get("sankar")
res26: Option[String] = None
scala> myMap.get("Sankar")
res27: Option[String] = Some(.Net)
scala> myMap.getOrElse("Sara","Nothing found")
res28: String = Nothing found
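One common way to consume the Option returned by get is a pattern match; a small sketch with this map:
myMap.get("Sankar") match {
  case Some(role) => println(s"Found: $role")   // prints Found: .Net
  case None       => println("Not found")
}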
scala> val myMutableMap = collection.mutable.Map("Lakshmi"->"HR")
myMutableMap: scala.collection.mutable.Map[String,String] = Map(Lakshmi -> HR)
scala> myMutableMap += ("Priyanka"-> "HR")
res32: myMutableMap.type = Map(Priyanka -> HR, Lakshmi -> HR)
scala> var myList = List(1,2,3,4,5)
myList: List[Int] = List(1, 2, 3, 4, 5)
scala> myList.contains(3)
res33: Boolean = true
Add all the elements:
myList.sum
Array of odd numbers from 0 to 15:
scala> var myOddArray = Array.range(0,15,2) // starting at 0 actually gives even numbers
myOddArray: Array[Int] = Array(0, 2, 4, 6, 8, 10, 12, 14)
scala> Range(1,15,2)
res34: scala.collection.immutable.Range = Range(1, 3, 5, 7, 9, 11, 13)
scala> Array.range(1,15,2)
res35: Array[Int] = Array(1, 3, 5, 7, 9, 11, 13)
scala> Array.range(1,16,2)
res37: Array[Int] = Array(1, 3, 5, 7, 9, 11, 13, 15)
scala> val myList = List(1,2,1,1,2,3,4,4,5,6,6)
myList: List[Int] = List(1, 2, 1, 1, 2, 3, 4, 4, 5, 6, 6)
scala> val mySet = myList.toSet
mySet: scala.collection.immutable.Set[Int] = Set(5, 1, 6, 2, 3, 4)
Control flow statements:
-----------------------
// comments
if (boolean)
{
}
else if (boolean)
{
}
else
{
}
No semicolons needed at the end.
save the file with file name as : scala_programming.scala
if (true)
{
println("I will print if True")
}
run it in spark-shell
:load scala_programming.scala
val x:String = "Hello"
if (x.endsWith("o"))
{
println("The value of x ends with o")
}
val x:String = "Zzzzzz"
if (x.endsWith("o"))
{
println("The value of x ends with o")
}
else
{
println("The value of x doesnt end with o")
}
val person:String = "George"
if (person == "Sammy")
{
println("Welcome Sammy")
}
else if (person =="George")
{
println ("Welcome George!")
}
else
{
println("What is your name?")
}
Logical Operators:
AND &&
println( (1 ==2) && (2 == 2)) // false
println( (1 ==1 ) && (2 == 2)) // true
OR ||
println( (1 ==1 ) || (2 == 2)) // true
NOT ! (negation, e.g. inequality checking)
println(! (1==1)) // false
FOR:
for(item <- iterable_sequence)
{
do something
}
exa:
for (item <- List(1,2,3))
{
println("Hello")
}
Hello
Hello
Hello
for (item <- List(1,2,3))
{
println(item)
}
1
2
3
for (num <- Array.range(0,5,1))
{
println(num)
}
0
1
2
3
4
for (num <- Set(1,2,3))
{
println(num)
}
1
2
3
for( num <- Range(0,10))
{
if (num % 2 == 0)
{
println(s"$num is even")
}
else
{
println(s"$num is odd")
}
}
val names = List("John","Abe","Cindy","Cat")
for (name <- names)
{
if (name.startsWith("C"))
{
println(s"$name starts with a C")
}
}
while:
var x = 0
while (x < 5)
{
println(s"x is currently $x")
println("x is still less than 5, adding 1 to x")
x = x + 1
}
Scala doesn't have built-in break functionality.
We have to import it from the util package
import util.control.Breaks._ /// we import breakable and break from this namespace
var y = 0
breakable
{
  while (y < 10)
  {
    println (s"y is currently $y")
    println ("y is still less than 10, add 1 to y")
    y = y + 1
    if (y == 3) break
  }
}
Functions in Scala:
-------------------
Methods Vs Functions
We will use the term "function" to describe the idea of having a piece of code that
can take in parameters, perform an operation, and produce an output.
Easily reuse and call code segments
def simple() : Unit =
{
println ("simple print")
}
run it : simple()
Unit is nothing but void in other languages - it doesn't return anything
with parameters:
def adder(num1:Int, num2:Int) : Int =
{
return num1 + num2;
}
adder(5,5) ==> 10
adder("sare","ga") ==> error Type Mismatch. Because the return type is Int
def greetName(name:String) : String =
{
return s"Hello $name"
}
val fullgreet = greetName("Jose")
println(fullgreet)
Prime Number checking:
----------------------
def isPrime(numcheck:Int) : Boolean =
{
for ( n <- Range(2,numcheck))
{
if (numcheck % n == 0)
{
return false
}
}
return true
}
println(isPrime(10)) // false
println(isPrime(23)) // true
Spark DataFrames:
-----------------
What, How to, programming, Exercises
DataFrames are now the standard way of dealing with data for Scala and Spark
Spark is moving away from the RDD syntax in favor of a simpler to understand DataFrame Syntax
DataFrame : like a large Excel sheet, with rows and columns
Standard way of using Spark's Machine learning capabilities
http://spark.apache.org/
Speed:
Run programs up to 100x faster than Hadoop MapReduce in memory, or 10x faster on disk
DataFrames Overview:
dfexa1.scala :
-----------
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
//val df = spark.read.csv("file.csv") /// header not included
val df = spark.read.option("header","true").option("inferSchema","true").csv("file.csv") // header included
//df.head(5)
df.columns
df.describe().show()
df.select("Volume")
for (row <- df.head(5))
{
println(row)
}
df.select($"Date",$"Close").show()
val df2 = df.withColumn("HighPlusLow",df("High")+df("Low"))
df2.printSchema()
df2("HighPlusLow").as("HPL") /// alias for that columns
df2.select(df2("HighPlusLow").as("HPL"),df2("Close")).show()
scala> :load dfexa1.scala // to execute scala program within scala prompt
Next lesson:
------------
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("file:///C:/Spark/file.csv")
df.printSchema()
SQL Syntax and Scala Syntax ($ sign notation)
import spark.implicits._
Grab all the rows and columns
scala notation:
df.filter($"Close" > 480).show()
==> Close > 480 --- filtered result
SQL notation:
df.filter("Close > 480").show()
Filtered by Multiple columns:
------------------------------
scala notation:
---------------
df.filter($"Close" < 480 && $"High" < 480).show()
SQL notation:
-------------
df.filter("Close < 480 AND High < 480")
collect the result to scala object:
-----------------------------------
df.filter("Close < 480 AND High < 480").collect()
Filter is the Transformation
Show is the Action
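A short sketch of that split using the same DataFrame (nothing executes until an action is called):
val cheap = df.filter("Close < 480 AND High < 480")   // transformation: builds a plan only
cheap.show()                                           // action: runs the plan and prints rows
val rows = cheap.collect()                             // action: returns the rows as Array[Row]
val n = cheap.count()                                  // action: returns the row count as a Long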
val dfs = sqlContext.read.json("file:/home/cloudera/employee.json")
val dfs = sqlContext.read.json("file:/home/cloudera/file.csv")
import org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType};
import org.apache.spark.sql.Row;
val csv = sc.textFile("file:/home/cloudera/file.csv")
val rows = csv.map(line => line.split(",").map(_.trim))
val header = rows.first
val data = rows.filter(_(0) != header(0))
val rdd = data.map(row => Row(row(0),row(1).toInt))
val schema = new StructType().add(StructField("id", StringType, true)).add(StructField("val", IntegerType, true))
val df = sqlContext.createDataFrame(rdd, schema)
I was facing the same issue; after investigation I observed there was a compatibility issue between the Spark version and the hadoop-2.x.x winutils.exe.
After experimenting, I suggest using the hadoop-2.7.1 winutils.exe with spark-2.2.0-bin-hadoop2.7 and the hadoop-2.6.0 winutils.exe with spark-1.6.0-bin-hadoop2.6, and setting the environment variables below
SCALA_HOME : C:\Program Files (x86)\scala2.11.7;
JAVA_HOME : C:\Program Files\Java\jdk1.8.0_51
HADOOP_HOME : C:\Hadoop\winutils-master\hadoop-2.7.1
SPARK_HOME : C:\Hadoop\spark-2.2.0-bin-hadoop2.7
PATH : %JAVA_HOME%\bin;%SCALA_HOME%\bin;%HADOOP_HOME%\bin;%SPARK_HOME%\bin;
Create the C:\tmp\hive directory and give access permission using the command below
C:\Hadoop\winutils-master\hadoop-2.7.1\bin>winutils.exe chmod -R 777 C:\tmp\hive
Remove the local Derby-based metastore metastore_db directory from the computer if it exists.
C:\Users\<User_Name>\metastore_db
Use the command below to start the Spark shell
C:>spark-shell
What is, How to use, Programming, Exercises
-------------------------------------------
DataFrames : Standard way of dealing with data for Scala and Spark
RDD Syntax ==> DataFrame Syntax
DataFrame : Large spreadsheet with Rows X Columns
Standard way of using Spark's Machine Learning capabilities
RDD ( Out dated )
http://spark.apache.org
DataSet is a distributed collection of data
df: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 16 more fields]
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("file:///C:/Spark/file.csv")
df.head(5)
for(row <- df.head(5))
{
println(row)
}
select a column:
df.select("county").show()
select multiple columns ($ in front of the column name)
df.select($"policyID",$"county").show()
|policyID|statecode| county|eq_site_limit|hu_site_limit|fl_site_limit|fr_site_limit| tiv_2011| tiv_2012|eq_site_deductible|hu_site_deductible|fl_site_deductible|fr_site_deductible|point_latitude|point_longitude| line| construction|point_granularity|superCool|
df.select($"policyID")
df.withColumn("superCool",df("policyID")+df("policyID"))
scala> df.printSchema()
root
|-- policyID: integer (nullable = true)
|-- statecode: string (nullable = true)
|-- county: string (nullable = true)
|-- eq_site_limit: double (nullable = true)
|-- hu_site_limit: double (nullable = true)
|-- fl_site_limit: double (nullable = true)
|-- fr_site_limit: double (nullable = true)
|-- tiv_2011: double (nullable = true)
|-- tiv_2012: double (nullable = true)
|-- eq_site_deductible: double (nullable = true)
|-- hu_site_deductible: double (nullable = true)
|-- fl_site_deductible: double (nullable = true)
|-- fr_site_deductible: integer (nullable = true)
|-- point_latitude: double (nullable = true)
|-- point_longitude: double (nullable = true)
|-- line: string (nullable = true)
|-- construction: string (nullable = true)
|-- point_granularity: integer (nullable = true)
https://github.com/anujdutt9/BigData-and-Machine-Learning
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("file:///C:/Spark/exercise/CitiGroup2006_2008")
// scala notation
df.printSchema()
df.filter($"Close" > 480).show()
df.filter($"Close" < 480 && $"High" < 480).show()
//sql notation
df.filter("Close > 480").show()
df.filter("Close < 480.0 AND High < 480.0").show()
var CollectedHere = df.filter("Close < 480 AND High < 480").collect()
scala> var CollectedHere1 = df.filter("Close < 480 AND High < 480").count()
CollectedHere1: Long = 397
root
|-- Date: timestamp (nullable = true)
|-- Open: double (nullable = true)
|-- High: double (nullable = true)
|-- Low: double (nullable = true)
|-- Close: double (nullable = true)
|-- Volume: integer (nullable = true)
scala> df.filter($"High" === 484.40).show() /// Triple Equal Sign to compare
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-04-27 00:00:00|472.0|484.4|471.5|481.5|2464800|
+-------------------+-----+-----+-----+-----+-------+
scala> df.select(corr("High","Low")).show() // correlation between High and Low Column
+------------------+
| corr(High, Low)|
+------------------+
|0.9992999172726325|
+------------------+
Help Documentation:
http://spark.apache.org/docs/latest/api/scala/index.html
scala> df.groupBy("Company").mean().show()
+-------+
|Company|
+-------+
| GOOG|
| FB|
| MSFT|
+-------+
scala> df.orderBy("Sales").show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| GOOG|Charlie| 120|
| MSFT| Amy| 124|
| GOOG| Sam| 200|
| MSFT|Vanessa| 243|
| GOOG| Frank| 340|
| FB| Sarah| 350|
| MSFT| Tina| 600|
| FB| Carl| 870|
+-------+-------+-----+
scala> df.orderBy("Person").show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| MSFT| Amy| 124|
| FB| Carl| 870|
| GOOG|Charlie| 120|
| GOOG| Frank| 340|
| GOOG| Sam| 200|
| FB| Sarah| 350|
| MSFT| Tina| 600|
| MSFT|Vanessa| 243|
+-------+-------+-----+
scala> df.orderBy("Sales").show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| GOOG|Charlie| 120|
| MSFT| Amy| 124|
| GOOG| Sam| 200|
| MSFT|Vanessa| 243|
| GOOG| Frank| 340|
| FB| Sarah| 350|
| MSFT| Tina| 600|
| FB| Carl| 870|
+-------+-------+-----+
scala> df.orderBy($"Sales".desc).show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| FB| Carl| 870|
| MSFT| Tina| 600|
| FB| Sarah| 350|
| GOOG| Frank| 340|
| MSFT|Vanessa| 243|
| GOOG| Sam| 200|
| MSFT| Amy| 124|
| GOOG|Charlie| 120|
+-------+-------+-----+
df.groupBy("Company").max().show()
df.groupBy("Company").min().show()
df.groupBy("Company").sum().show()
df.select(countDistinct("Sales")).show()
df.select(sumDistinct("Sales")).show()
df.select(variance("Sales")).show()
df.select(stddev("Sales")).show()
df.select(collect_set("Sales")).show()
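The aggregate helpers used above (countDistinct, sumDistinct, variance, stddev, collect_set, as well as corr, year, month used elsewhere) live in org.apache.spark.sql.functions; spark-shell may already import these for you, but in a standalone program the import is needed explicitly. A sketch:
import org.apache.spark.sql.functions._   // brings in countDistinct, stddev, collect_set, corr, year, month, ...
df.select(countDistinct("Sales")).show()
df.select(stddev("Sales")).show()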
ContainsNULL.csv:
-----------------
Id,Name,Sales
emp1,John,
emp2,,
emp3,,345.0
emp4,Cindy,456.0
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("file:///C:/Spark/exercise/ContainsNULL.csv")
df.printSchema()
df.show()
:load MissingData.scala
root
|-- Id: string (nullable = true)
|-- Name: string (nullable = true)
|-- Sales: double (nullable = true)
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+ // Upto you to do any of the following :: No correct answer
|emp1| John| null| // May be you are ok with 1 or the other :: Keep them
|emp2| null| null| // You are not ok with both missing :: Drop them completely
|emp3| null|345.0| // Machine Learning : Fill the missing things with some other values.
|emp4|Cindy|456.0|
+----+-----+-----+
df.na. <tab>
drop fill replace
scala> df.show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
scala> df.na.drop().show() // if any row contains even a single NULL column value, that row will be omitted
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+
Drop rows that do not have a minimum number of non-NULL values
scala> df.na.drop(2).show() /// keeps only rows with at least 2 non-NULL values
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
// wherever a numeric column has a NULL, it will be filled with 100
scala> df.na.fill(100).show() // fill NULL with 100
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John|100.0| // 100 is filled here in the place of NULL
|emp2| null|100.0| // 100 is filled here in the place of NULL
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
// wherever a string column has a NULL, it will be filled with "Missing"
scala> df.na.fill("Missing").show()
+----+-------+-----+
| Id| Name|Sales|
+----+-------+-----+
|emp1| John| null|
|emp2|Missing| null|
|emp3|Missing|345.0|
|emp4| Cindy|456.0|
+----+-------+-----+
scala> df.na.fill("New Name",Array("Name")).show()
+----+--------+-----+
| Id| Name|Sales|
+----+--------+-----+
|emp1| John| null|
|emp2|New Name| null|
|emp3|New Name|345.0|
|emp4| Cindy|456.0|
+----+--------+-----+
scala> df.na.fill(200.00,Array("Sales")).show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John|200.0|
|emp2| null|200.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
df.select(month(df("Date"))).show()
df.select(year(df("Date"))).show()
val df2 = df.withColumn("Year",year(df("Date")))
val dfmins = df2.groupBy("Year").min()
dfmins.select($"Year",$"min(Close)").show()
V2 Maestros:
------------
Apache Spark is the top level open source project under the Apache Umbrella
Tool / Engine / Language
Fast and General Engine for large-scale distributed data processing
Faster than Apache Hadoop MapReduce (which is batch-oriented and slower)
Any kind of general purpose analytics
Open source cluster computing framework (large number of cheap nodes)
End - To - End Analytics platforms
Data Ingestion
Data Transformation
Machine Learning
Interactive analytics through to taking action
Developed to overcome the limitations of MapReduce
Runs from a single Desktop or a huge cluster
Single Windows System
Iterative capability - like step#1, step#2 ....
Interactive: log in to the shell, execute some code or a transformation, and look at the result
Or stream processing (real time): data coming in from social networking sites
Scala,Python,R,Java :: You can do programming in any of the 4 languages
Adv:
Fast growing open Source Engine
100X faster than MapReduce in memory - Keep data in memory
10X faster than MapReduce in disk
Run alongside other Hadoop Components
Support for many programming languages:
Scala, R, Python, Java, piping
Same functionality across multiple languages
Libraries:
Graph, SQL, Machine Learning, Streaming
Hadoop, MapReduce:
Disk based
slower
limited operations
very batch oriented
Spark Use Cases:
Data Integration and ETL (Extract Transform Load)
Interactive Analytics using Spark Shell
High Performance Batch computation
Machine learning and advanced analytics
moving from 1 Algorithm to another Algorithm is simple and similar
Real time stream processing
Example applications:
Credit card fraud detection, network detection
Network Intrusion Detection
Advertisement Targeting : click stream, what kind of ad you want to show for that person
Spark Workflow:
Loading from data source to :
HDFS, NoSQL, S3, real time sources
Transform Data:
Filter, Clean, Join, Enhance
Stored Processed Data:
shell,Spark SQL, 3rd party tools (JDBC)
Machine Learning
with Real Time Data
Spark Framework:
Programming
Library
Engine
Management
Storage
Engine:
The core of Spark: executes the code
Takes the data and splits it up across the nodes of the cluster
Performs the operations and returns the results
Interfaces to get / store data
Management:
How to manage Spark?
YARN (Yet Another Resource Negotiator) - the distributed scheduling system that comes with Hadoop / MapReduce
YARN can manage Spark as well
CDH - Cloudera Distribution of Hadoop, which comes with YARN, Hadoop, and Spark
Apache Mesos - one more Apache Project which can manage Spark
Spark has its own built-in scheduler (the Spark standalone scheduler) to control its clusters
You can use any of these 3 (YARN / Mesos / Spark Scheduler)
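In code the choice mostly shows up as the master URL passed to SparkConf; a sketch with placeholder host names (in Spark 2.x "yarn" is the YARN master string):
import org.apache.spark.SparkConf
val conf = new SparkConf()
  .setAppName("MyJob")
  .setMaster("yarn")                        // managed by YARN
  // .setMaster("mesos://mesos-host:5050")  // managed by Apache Mesos
  // .setMaster("spark://spark-host:7077")  // Spark's own standalone scheduler
  // .setMaster("local[*]")                 // single JVM, for development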
Management interfaces
Storage:
Spark has to read data from data sources, store it in clusters / memory, and manipulate the data.
Local File Systems (Storing / Reading / Writing)
HDFS - Hadoop Distributed File System
S3 - Cloud Storage
RDBMS - Read and Write from / to RDBMS
NoSQL storage : Cassandra, HBase, MongoDB
Libraries:
Libraries for manipulating data
Spark SQL : provides easy SQL like interface for Spark
Easy way of doing : Read, Analyze, Write data, Grouping, Filtering
MLLib : Machine Learning Library
Spark supports a larger number of algorithms than Hadoop
Writing, understanding, and switching between algorithms is easy
GraphX - Graph Analysis
Streaming - to analyze real-time data (data coming from a TCP/IP socket, Twitter feed, ...)
Look up data, analyze it, and produce results using machine learning algorithms
Predictions / Take actions
Programming: Scala, Python, Java, R
Spark is written in Scala
Scala runs on the JVM (it compiles to Java bytecode)
Almost everything in the Scala API is also available in Python (similar APIs)
Shell interaction is available for Scala, Python, and R, but not for Java
REPL - Read Evaluate Print Loop
3rd party tools - JDBC connection (get, analyze, and return data)
ETL, Reporting
RDDs : Resilient Distributed DataSet
Spark is built around RDDs
Create, Transform, Analyze and store RDDs in a Spark Program
DataSet :
collection of elements of any type.
any type : Strings, Lines, rows, objects, collections
Dataset can be partitioned and distributed across multiple nodes. ::>> to do parallel processing
RDDs are immutable (They can't be changed)
If you want to make a change to an RDD, you create a new RDD by applying a transformation (whatever changes you want)
Once the RDD is created, you can't change them
RDDs can be cached and persisted
Transformations:
Operations
Things that you do on an RDD which create a new RDD (transformations - filter, grouping)
Actions : analyze the RDDs and provide the result
Transformation : produce New RDD
Actions : produce a result
Master Node
Driver Program : main guy who drives your program
Spark Context : gateway to all spark functionality
Analogous to a database connection
Cluster Manager:
Manage various jobs
The driver program and cluster manager take care of executing the job across the cluster
Spark job is split into >>> Tasks
Tasks are distributed across worker nodes (clusters)
Any time an RDD is created, it is partitioned and distributed across the various worker nodes
Execution results are sent back to the Spark driver
Worker Node
Executor
Task
Cache
Worker nodes are the slave nodes that execute the tasks
Worker Node
Executor
Task
Cache
We can add worker nodes to improve performance through parallelism
More worker nodes mean more total memory, which helps keep more RDDs in memory
Data in memory can be processed faster than data on disk
Spark Scalability:
Single JVM (Development / Learning machine)
Runs on a single box
All components (Driver, Executors) run within the same JVM
Managed Cluster:
can scale from 2 to 1000s of nodes
can use any cluster manager for managing nodes
Data is distributed and processed on all nodes
Driver program:
Main executable program from where Spark operations are performed
Controls and co-ordinates all operations
Driver program is the "Main" class
Executes parallel operations on a cluster
Defines RDDs
Each driver program executes a Job
Spark Context:
Driver accesses Spark functionality through SparkContext object
Represents a connection to the computing cluster
Used to build RDDs
Works with the cluster manager
Manages executors running on Worker Nodes
Splits Jobs as parallel "Tasks" and executes them on worker nodes
Partitions RDDs and distributes them on the cluster
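A sketch of a standalone driver program that pulls these pieces together; the object name and input path are placeholders:
import org.apache.spark.{SparkConf, SparkContext}
object WordCountDriver {                                   // the driver program ("Main" class)
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCountDriver")
    val sc = new SparkContext(conf)                        // gateway to all Spark functionality
    val lines = sc.textFile("hdfs:///path/to/input.txt")   // placeholder path; builds an RDD
    val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    println(counts.count())                                // action: runs the job as parallel tasks on workers
    sc.stop()
  }
}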
Spark Execution Modes:
Batch Mode:
A program is scheduled for execution through the scheduler
Runs fully at periodic intervals and processes data
No user input at run time for batch-mode processing
It takes the input DB / output DB connection information from config files
It reads, manipulates, and writes data to its data sinks
Interactive Mode:
Interactive shell is used by the user to execute Spark Commands 1 by 1
Shell acts as the Driver Program and provides SparkContext
Can run tasks on a cluster
Even if you run in Shell also, you are actually running the tasks in clusters
Development purpose / Ad-hoc analysis
Streaming Mode:
An always-running program continuously processes data as it arrives
Read data -> Do Transformations -> write results
Lazy Evaluation:
Spark will not load or transform data unless an action is performed
1) Load file into RDDs
2) Filter the RDDs
3) Count number of elements (Only now loading and filtering happens)
Transformations will be done only when any Action called
Helps internally optimize operations and resource usage
Makes life easy for developers - you can write chained operations
Watch out during troubleshooting
Errors found while executing actions might be related to earlier transformations
If the file name is wrong at load time, it won't show an error
Only when you count the words (an action) will it report that the file name is wrong - the error is shown only when an action is performed
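A sketch of how that plays out in spark-shell, assuming sc; the path is a placeholder:
val lines = sc.textFile("/no/such/file.txt")          // no error yet: nothing is read
val socrates = lines.filter(_.contains("Socrates"))   // still no error: just another recipe
socrates.count()                                       // action: only NOW does Spark try to read the file,
                                                       // so the bad path is reported here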
Derek Banas:
------------
Explore the world of Functional programming language
scala> abs(-8)
cbrt(9)
ceil(5.45)
round(5.45)
floor(5.45)
exp(1)
pow(2,2)
sqrt(pow(2,2))
(random * (11-1) + 1).toInt
toRadians(90)
toDegrees(1.570796)
var age = 18
val canVote = if(age >= 18) "yes" else "no"
if ( (age >= 5) && (age <= 6))
{
println("Go to Kindergarten")
}
else if( (age >6) && (age <= 7))
{
println ("Go to Grade #1")
}
else
{
println("Go to Grade " + (age - 5))
}
true || false
Looping:
---------
object ScalaTutorial
{
def main(args: Array[String])
{
var i = 0
while (i <= 10)
{
println(i)
i+= 1
}
}
}
Save this file as ScalaTutorial.scala
Go to that folder where it is stored.
run it using : scala ScalaTutorial.scala
0
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial2
{
def main(args:Array[String])
{
var i = 0
val aList = List(1,2,3,4,5,6,7,8,9,10)
for (i <- aList)
{
println("List Items "+ i)
}
}
}
C:\scala\exercise>scala ScalaTutorial2.scala
List Items 1
List Items 2
List Items 3
List Items 4
List Items 5
List Items 6
List Items 7
List Items 8
List Items 9
List Items 10
import scala.io.StdIn.{readLine, readInt}
import scala.math._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
import scala.io.Source
object ScalaTutorialGuess
{
def main(args:Array[String])
{
var numberGuess = 0
do
{
print("Guess a number")
numberGuess = readLine.toInt
} while (numberGuess != 15)
printf("You guessed the secret number %d\n",15)
}
}
import scala.io.StdIn.{readLine, readInt}
import scala.math._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
import scala.io.Source
object ScalaTutorialGuess
{
def main(args:Array[String])
{
var name = "Derek"
val age = 39
val weight = 175.5
println(s"Hello $name")
println(s"Hello ${name}")
println(f"I am ${age + 1} and weigh $weight%.2f")
}
}
var randSent = "I saw a dragon fly by"
println(randSent.indexOf("dragon"))
randSent(3)
randSent.length
randSent.concat(" and explode!")
"I saw a dragon".equals(randSent)
printf("'%5d'\n",5)
printf("'%-5d'\n",5)
Functions:
----------
def functionName (param1:dataType,param2:datatype) : returnType =
{
function body
return valueToReturn
}
Named arguments:
--------------------
import scala.io.StdIn.{readLine, readInt}
import scala.math._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
import scala.io.Source
object FunctionExa2
{
def main(args:Array[String])
{
def getSum(num1:Int,num2:Int):Int =
{
return num1+num2
}
println("5+4="+getSum(num2=5,num1=4))
}
}
Void functions - No return value from the function
just side effects
------------------
object VoidFunction
{
def main(args:Array[String])
{
def sayHi():Unit =
{
println("Hi!.. How are You?")
}
sayHi
sayHi()
}
}
Multiple Arguments with same data type:
---------------------------------------
object MultiArgs
{
def main(args: Array[String])
{
def getSum(args: Int*) : Int =
{
var sum : Int = 0
for(num <- args)
{
sum += num
}
return sum
}
println("Get Sum " + getSum(1,23,4,5,6))
}
}
Factorial : Recursive Function
-----------------------------
import scala.io.StdIn.{readLine, readInt}
import scala.math._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
import scala.io.Source
object MultiArgs
{
def main(args: Array[String])
{
def factorial(num:BigInt) : BigInt =
{
if (num <= 1)
{
return 1
}
else
{
num * factorial(num - 1)
}
}
println("Factorial of 4 =" + factorial(4))
}
}
val friends = Array("Bob","Tom")
friends(0) = "Sue"
scala> println(f"Best Friend : ${friends(0)}")
Best Friend : Sue
scala> println(s"Best Friend : ${friends(0)}")
Best Friend : Sue
val friends2 = scala.collection.mutable.ArrayBuffer[String]()
friends2.insert(0,"Phil")
friends2 += "Mark"
friends2 ++= Array("Susy","Paul")
friends2.insert("1","Mike","Sally","Sam","Mary","Sue")
scala> friends2
res15: scala.collection.mutable.ArrayBuffer[String] = ArrayBuffer(Phil, Mike, Sally, Sam, Mary, Sue, Mark, Susy, Paul)
scala> friends2.remove(1,2)
scala> friends2
res17: scala.collection.mutable.ArrayBuffer[String] = ArrayBuffer(Phil, Sam, Mary, Sue, Mark, Susy, Paul)
scala> val friend:String =""
friend: String = ""
scala> for (friend <- friends2) println(friend)
Phil
Sam
Mary
Sue
Mark
Susy
Paul
scala> val favNums = new Array[Int](20)
favNums: Array[Int] = Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
scala> for (j <- 0 to (favNums.length - 1))
| {
| favNums(j) = j
| println(favNums(j))
| }
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
val favNumsTimes2 = for(num <- favNums) yield 2*num
favNumsTimes2.foreach(println)
var favNumsDiv4 = for(num <- favNums if num % 4 == 0) yield num
favNumsDiv4.foreach(println)
Multiplication table:
---------------------
import scala.io.StdIn.{readLine, readInt}
import scala.math._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
import scala.io.Source
object MultiplicationTable
{
def main(args: Array[String])
{
var mulTable = Array.ofDim[Int](10,10)
for(i <- 0 to 9)
{
for(j <- 0 to 9)
{
mulTable(i)(j) = i*j
}
}
for (i <- 0 to 9)
{
for (j <- 0 to 9)
{
println(s"$i * $j = ${mulTable(i)(j)}")
}
}
}
}
favNums.sortWith(_>_)
val sortedNums = favNums.sortWith(_<_)
sortedNums.deep.mkString(", ")
val employees = Map("Manager" -> "Bob Smith", "Secretary" -> "Sue Brown")
if (employees.contains("Manager"))
{
println("Manager : " + employees("Manager"))
}
Mutable Map:
------------
val customers = collection.mutable.Map(100 -> "Raj",101 -> "Arun")
customers(100) = "Tom Marks"
customers(102) = "Megan Swift"
for((k,v) <- customers)
{
println(s"$k : $v")
}
Answer:
101 : Arun
100 : Tom Marks
102 : Megan Swift
scala> var tupleMerge = (101,"Merge Simpson",10.25)
tupleMerge: (Int, String, Double) = (101,Merge Simpson,10.25)
scala> tupleMerge.productIterator.foreach(i => println(i))
101
Merge Simpson
10.25
scala> tupleMerge.toString
res10: String = (101,Merge Simpson,10.25)
scala> tupleMerge.toString()
res11: String = (101,Merge Simpson,10.25)
object HelloWorld {
def main(args: Array[String]) {
println("Hello, world!")
}
}
scala> :t 234
Int
scala> :t "sarega"
String
scala> :t 's'
Char
scala> :t 234.234
Double
scala> :t 1 < 1
Boolean
object Demo {
def main(args: Array[String]) {
var myVar :Int = 10;
val myVal :String = "Hello Scala with datatype declaration.";
var myVar1 = 20;
val myVal1 = "Hello Scala new without datatype declaration.";
println(myVar); println(myVal); println(myVar1);
println(myVal1);
}
}
run it : scala Demo.scala
-------------------------------------------------------------------
scala> :paste
// Entering paste mode (ctrl-D to finish)
val a = 10
val b = 20
val c = 30
print (a+b+c)
// Exiting paste mode, now interpreting.
60a: Int = 10
b: Int = 20
c: Int = 30
scala> val i = 1+2
i: Int = 3
scala> val i = 1.+(2)
i: Int = 3
scala> print("Hello World")
Hello World
scala> val s = "Hello World"
s: String = Hello World
scala> s.toUpperCase
res8: String = HELLO WORLD
scala> s.toLowerCase
res9: String = hello world
scala> s.substring(6)
res11: String = World
scala> s.substring(6,8)
res12: String = Wo
s.length -- Tab
scala> s.length
length lengthCompare
scala> s.length -- Tab and one more Tab again
def length(): Int
for (i <- (1 to 100)) { print(i + " ")}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
scala> :paste
// Entering paste mode (ctrl-D to finish)
var total = 0
for (element <- (1 to 100))
total += element
// Exiting paste mode, now interpreting.
total: Int = 5050
var total = 0
for (element <- (1 to 100))
{
if (element % 2 == 0) total += element
}
// Exiting paste mode, now interpreting.
total: Int = 2550
scala> :paste
// Entering paste mode (ctrl-D to finish)
var totalEven = 0
var totalOdd = 0
for (element <- (1 to 100))
{
if (element % 2 == 0) totalEven += element else totalOdd += element
}
// Exiting paste mode, now interpreting.
totalEven: Int = 2550
totalOdd: Int = 2500
scala> :paste
// Entering paste mode (ctrl-D to finish)
var lb = 1
var ub = 100
var totalEven = 0
var totalOdd = 0
while (lb <= ub)
{
if (lb % 2 == 0) totalEven += lb else totalOdd += lb
lb += 1
}
// Exiting paste mode, now interpreting.
lb: Int = 101
ub: Int = 100
totalEven: Int = 2550
totalOdd: Int = 2500
^l to clear screen (Ctrl + l)
scala> :paste
// Entering paste mode (ctrl-D to finish)
def sum(lb: Int, ub: Int) = {
var total = 0
for(element <- lb to ub)
{
total += element
}
total
}
// Exiting paste mode, now interpreting.
sum: (lb: Int, ub: Int)Int
scala> sum (Tab twice)
def sum(lb: Int,ub: Int): Int
scala> sum(1,10)
res22: Int = 55
scala> :paste
// Entering paste mode (ctrl-D to finish)
def sum(func: Int => Int, lb: Int, ub: Int) = {
var total = 0
for(element <- lb to ub)
{
total += func(element)
}
total
}
// Exiting paste mode, now interpreting.
sum: (func: Int => Int, lb: Int, ub: Int)Int
scala> def id(i: Int) = i
id: (i: Int)Int
scala> def sqr(i: Int) = i * i
sqr: (i: Int)Int
scala> def cube (i : Int) = i * i * i
cube: (i: Int)Int
scala> def double (i: Int) = i * 2
double: (i: Int)Int
Function arguments :
don't give val or var. neither mutable nor immutable
have to give data type
def functionname(arg1 : datatype, arg2 : datatype) : return type
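Putting the pieces above together, the higher-order sum defined earlier can be called with any of those small functions; a usage sketch:
sum(id, 1, 10)       // 55   : sum of 1..10
sum(sqr, 1, 10)      // 385  : sum of squares of 1..10
sum(cube, 1, 10)     // 3025 : sum of cubes of 1..10
sum(double, 1, 10)   // 110  : twice the plain sum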
Create a Class:
----------------
scala> class Order(orderId : Int, orderDate : String, orderCustomerId : Int, orderStatus : String){
| println("I am inside Order Constructor")
| }
defined class Order
compile it:
------------
:javap -p Order
Compiled from "<console>"
public class $line3.$read$$iw$$iw$Order {
public $line3.$read$$iw$$iw$Order(int, java.lang.String, int, java.lang.String);
}
scala> val order = new Order(1,"2013-10-01 00:00:00.000", 100,"COMPLETE")
I am inside Order Constructor
order: Order = Order@182cc69e
scala> class Order(orderId : Int, orderDate : String, orderCustomerId : Int, orderStatus : String){
| println ("I am inside Order constructor")
| override def toString = " Order ( " + orderId + "," + orderDate + " ," + orderCustomerId + ", " + orderStatus + ")"
| }
defined class Order
scala> val order = new Order(1,"2013-10-01 00:00:00.000", 100,"COMPLETE")
I am inside Order constructor
order: Order = Order ( 1,2013-10-01 00:00:00.000 ,100, COMPLETE)
scala> println(order);
Order ( 1,2013-10-01 00:00:00.000 ,100, COMPLETE)
REPL : Read Evaluate Print Loop
scala> 10 + 3 * 5 / 2
res0: Int = 17
scala> "Your answer is " + res0
res1: String = Your answer is 17
// var is going to change - variable
scala> var myName = "Derek"
myName: String = Derek
//val is not going to change -- constant / never change
scala> val myAge = 40
myAge: Int = 40
comments
// single line comments
/*
multi line comments
*/
Data Types:
Byte, Boolean, Char, Short, Int, Long, Float, Double
BigInt
scala> val lgPrime = BigInt("6222288956456456456894864564648947895615648978945616549789641561489489489461564894894615618944561564")
lgPrime: scala.math.BigInt = 6222288956456456456894864564648947895615648978945616549789641561489489489461564894894615618944561564
scala> lgPrime+1
res3: scala.math.BigInt = 6222288956456456456894864564648947895615648978945616549789641561489489489461564894894615618944561565
scala> println ("5 + 4 = " + ( 5 + 4 ))
5 + 4 = 9
scala> println ("5 - 4 = " + ( 5 - 4 ))
5 - 4 = 1
scala> println ("5 * 4 = " + ( 5 * 4 ))
5 * 4 = 20
scala> println ("5 / 4 = " + ( 5 / 4 ))
5 / 4 = 1
scala> println ("5 % 4 = " + ( 5 % 4 ))
5 % 4 = 1
import math library:
------------------------
scala> import scala.math._
import scala.math._
scala> ceil(5.45)
res9: Double = 6.0
scala> round(5.45)
res10: Long = 5
scala> floor(5.45)
res11: Double = 5.0
scala> exp(1)
res12: Double = 2.718281828459045
scala> pow(2,2)
res13: Double = 4.0
scala> sqrt(pow(2,2) + pow(2,2))
res14: Double = 2.8284271247461903
scala> hypot(2,2)
res15: Double = 2.8284271247461903
scala> log10(1000)
res16: Double = 3.0
scala> log(2.7182818284590455)
res17: Double = 1.0
scala> min(5,10)
res20: Int = 5
scala> max(1,1000)
res21: Int = 1000
scala> (random * (11-1) + 1).toInt
res22: Int = 7
scala> (random * (11-1) + 1).toInt
res23: Int = 3
scala> var age = 19
age: Int = 19
scala> val canVote = if (age >= 18) "yes" else "no"
canVote: String = yes
// Multiline coding within scala shell CLI (Command Line Interface)
scala> :paste
// Entering paste mode (ctrl-D to finish)
var age = 17
val canVote = if (age >= 18) "yes" else "no"
// Exiting paste mode, now interpreting.
age: Int = 17
canVote: String = no
Exit from Scala
----------------
scala> :q // quit from Scala
C:\scala\bin>
^D to exit
Run Eclipse from here:
C:\scalaIDE\eclipse>eclipse.exe
eclipse:
---------
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
while (i <= 10)
{
println(i)
i += 1
}
}
}
4
5
6
7
8
9
10
in CLI:
--------
Run .scala program in CLI
C:\scala\exercise>scala sa.scala
0
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
while (i <= 10)
{
println(i)
i += 1
}
}
}
Result:
0
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
do {
println(i)
i += 1
}while (i <= 10)
}
}
Result:
-------
0
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
for(i <- 1 to 10)
{
println(i)
}
}
}
Result:
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
val randLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
for (i <- 0 until randLetters.length)
{
println(randLetters(i))
}
}
}
Result:
-------
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
val aList = List(1,2,3,4,5,6)
for (i <- aList)
{
println ("List Items #"+i)
}
}
}
Result:
-------
List Items #1
List Items #2
List Items #3
List Items #4
List Items #5
List Items #6
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
var evenList = for { i <- 1 to 20
if ( i % 2 == 0)
} yield i
for(i <- evenList)
println(i)
}
}
Result:
2
4
6
8
10
12
14
16
18
20
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
for (i <- 1 to 5; j <- 6 to 10)
{
println ("i : " + i)
println ("j : " + j)
}
}
}
Result:
--------
i : 1
j : 6
i : 1
j : 7
i : 1
j : 8
i : 1
j : 9
i : 1
j : 10
i : 2
j : 6
i : 2
j : 7
i : 2
j : 8
i : 2
j : 9
i : 2
j : 10
i : 3
j : 6
i : 3
j : 7
i : 3
j : 8
i : 3
j : 9
i : 3
j : 10
i : 4
j : 6
i : 4
j : 7
i : 4
j : 8
i : 4
j : 9
i : 4
j : 10
i : 5
j : 6
i : 5
j : 7
i : 5
j : 8
i : 5
j : 9
i : 5
j : 10
import scala.io.StdIn.{readLine, readInt}
import scala.math._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
import scala.io.Source
object ScalaTutorial
{
def main(args : Array[String])
{
var numberGuess = 0
do {
print ("Guess a number ")
numberGuess = readLine.toInt
}while (numberGuess != 15)
printf ("You guessted the secret Number %d",15)
}
}
Result:
-------
Guess a number 3
Guess a number 2
Guess a number 4
Guess a number 15
You guessed the secret Number 15
object ScalaTutorial
{
def main(args : Array[String])
{
val name = "Derek"
val age = 39
val weight = 175.5
println(s"Hello $name")
println(f"I am ${age+1} and weigh $weight%.2f")
}
}
Result:
Hello Derek
I am 40 and weigh 175.50
%c --> character
%s --> string
%f --> float
%d --> integer
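A minimal sketch using each of these specifiers in a single printf call (the values are made up purely for illustration):
printf("char: %c, string: %s, float: %f, int: %d\n", 'A', "Scala", 3.14f, 42)
// prints: char: A, string: Scala, float: 3.140000, int: 42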
object ScalaTutorial
{
def main(args : Array[String])
{
printf("'%5d'\n",5)
printf("'%05d'\n",5)
printf("'%-5s'\n",5)
printf("'%5s'\n",5)
}
}
' 5'
'00005'
'5 '
' 5'
object ScalaTutorial
{
def main(args : Array[String])
{
var randSent = "I saw a dragon fly by"
println ("3rd index : " + randSent(3))
println("String length : " + randSent.length)
println(randSent.concat(" and explode"))
println("Are strings equal : " + "I saw a dragon".equals(randSent))
println("dragon starts at index : " + randSent.indexOf("dragon"))
val randSentArray = randSent.toArray
for (v <- randSentArray)
println (v)
}
}
Result:
-------
3rd index : a
String length : 21
I saw a dragon fly by and explode
Are strings equal : false
dragon starts at index : 8
I
s
a
w
a
d
r
a
g
o
n
f
l
y
b
y
function template:
------------------
def funcName (param1:dataType, param2:dataType) : returnType = {
// function body
// return valueToReturn
}
object ScalaTutorial
{
def main(args : Array[String])
{
def getSum(n1 : Int = 1, n2 : Int = 2) : Int = {
return n1 + n2 ;
}
println ( " 5 + 4 = " + getSum(5,4))
}
}
Result:
5 + 4 = 9
object ScalaTutorial
{
def main(args : Array[String])
{
def getSum(n1 : Int = 1, n2 : Int = 2) : Int = {
return n1 + n2 ;
}
println ( " 5 + 4 = " + getSum(5,4))
println("result= " + getSum(n2 = 3, n1 = 10)) // named arguments
}
}
Result:
5 + 4 = 9
result= 13
// Unit means void function - no return from the function
object ScalaTutorial
{
def main(args : Array[String])
{
def sayHi(): Unit = {
println("Hi! How are you?")
}
sayHi
}
}
Result :
Hi! How are you?
// variable number of parameters in a function
object ScalaTutorial
{
def main(args : Array[String])
{
def getSum(args : Int*) : Int = {
var sum : Int = 0
for (num <- args) {
sum += num
}
sum
}
println ("Get Sum " + getSum(1,2,3,4,5,6))
println ("Get Sum " + getSum(100,200, 300))
}
}
Get Sum 21
Get Sum 600
object ScalaTutorial
{
def main(args : Array[String])
{
def factorial(num : BigInt) : BigInt = {
if (num <= 1)
1
else
num * factorial (num - 1)
}
println("Factorial of 4 = " + factorial(4))
println("Factorial of 5 = " + factorial(5))
}
}
Result:
Factorial of 4 = 24
Factorial of 5 = 120
object ScalaTutorial
{
def main(args : Array[String])
{
val favNums = new Array[Int] (20)
val friends = Array("Bob","Tom")
friends(0) = "Sue"
println("Best Friends " + friends(0))
val friends2 = ArrayBuffer[String]()
friends2.insert(0,"Phil")
friends2 += "Mark"
friends2 ++= Array("Susy","Paul")
friends2.insert(1,"Mike","Sally","Sam","Mary","Sue")
friends2.remove(1,2)
var friend : String = ""
for(friend <- friends2)
println(friend)
}
}
Result:
Best Friends Sue
Phil
Sam
Mary
Sue
Mark
Susy
Paul
object ScalaTutorial
{
def main(args : Array[String])
{
val myList = List(1,2,3,4,5,6,7,8,9,10)
myList.foreach(println)
}
}
Result:
--------
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var mulTable = Array.ofDim[Int](10,10)
for(i <- 0 to 9)
{
for (j <- 0 to 9)
{
mulTable(i)(j) = i * j
}
}
for (i <- 0 to 9)
{
for (j <- 0 to 9)
{
printf ("%d : %d = %d\n",i,j,mulTable(i)(j))
}
}
}
}
Result:
0 : 0 = 0
0 : 1 = 0
0 : 2 = 0
0 : 3 = 0
0 : 4 = 0
0 : 5 = 0
0 : 6 = 0
0 : 7 = 0
0 : 8 = 0
0 : 9 = 0
1 : 0 = 0
1 : 1 = 1
1 : 2 = 2
1 : 3 = 3
1 : 4 = 4
1 : 5 = 5
1 : 6 = 6
1 : 7 = 7
1 : 8 = 8
1 : 9 = 9
2 : 0 = 0
2 : 1 = 2
2 : 2 = 4
2 : 3 = 6
2 : 4 = 8
2 : 5 = 10
2 : 6 = 12
2 : 7 = 14
2 : 8 = 16
2 : 9 = 18
3 : 0 = 0
3 : 1 = 3
3 : 2 = 6
3 : 3 = 9
3 : 4 = 12
3 : 5 = 15
3 : 6 = 18
3 : 7 = 21
3 : 8 = 24
3 : 9 = 27
4 : 0 = 0
4 : 1 = 4
4 : 2 = 8
4 : 3 = 12
4 : 4 = 16
4 : 5 = 20
4 : 6 = 24
4 : 7 = 28
4 : 8 = 32
4 : 9 = 36
5 : 0 = 0
5 : 1 = 5
5 : 2 = 10
5 : 3 = 15
5 : 4 = 20
5 : 5 = 25
5 : 6 = 30
5 : 7 = 35
5 : 8 = 40
5 : 9 = 45
6 : 0 = 0
6 : 1 = 6
6 : 2 = 12
6 : 3 = 18
6 : 4 = 24
6 : 5 = 30
6 : 6 = 36
6 : 7 = 42
6 : 8 = 48
6 : 9 = 54
7 : 0 = 0
7 : 1 = 7
7 : 2 = 14
7 : 3 = 21
7 : 4 = 28
7 : 5 = 35
7 : 6 = 42
7 : 7 = 49
7 : 8 = 56
7 : 9 = 63
8 : 0 = 0
8 : 1 = 8
8 : 2 = 16
8 : 3 = 24
8 : 4 = 32
8 : 5 = 40
8 : 6 = 48
8 : 7 = 56
8 : 8 = 64
8 : 9 = 72
9 : 0 = 0
9 : 1 = 9
9 : 2 = 18
9 : 3 = 27
9 : 4 = 36
9 : 5 = 45
9 : 6 = 54
9 : 7 = 63
9 : 8 = 72
9 : 9 = 81
object ScalaTutorial
{
def main(args : Array[String])
{
var r = scala.util.Random
val favNums = new Array[Int] (11)
for (i <- 0 to 10)
{
favNums(i) = r.nextInt(10)
println(favNums(i))
}
for (i <- 0 to 10)
{
println(favNums(i))
}
println("Sum : " + favNums.sum)
println("Min : " + favNums.min)
println("Max : " + favNums.max)
var sortedNums = favNums.sortWith(_>_)
println(sortedNums.deep.mkString(","))
}
}
Result:
-----------
2
8
3
5
9
5
4
5
4
4
2
2
8
3
5
9
5
4
5
4
4
2
Sum : 51
Min : 2
Max : 9
9,8,5,5,5,4,4,4,3,2,2
--------------------------------------------------------------------
scala> 100
res5: Int = 100
scala> 2.5
res6: Double = 2.5
scala> 1 + 1
res7: Int = 2
scala> 2 - 1
res8: Int = 1
scala> 2 * 5
res9: Int = 10
scala> 1 / 2
res10: Int = 0
scala> 1.0 / 2
res11: Double = 0.5
scala> 1 / 2.0
res12: Double = 0.5
scala> 1.0 / 2.0
res13: Double = 0.5
scala> 1 / 2.0 / 0.5
res14: Double = 1.0
scala> math.pow(4,2)
res17: Double = 16.0
scala> 1 + 2 * 3 + 4
res18: Int = 11
scala> 1 + (2 * 3) + 4
res19: Int = 11
scala> (1+2) * (3+4)
res20: Int = 21
val : immutable - non changeable
var : mutable - can be reassigned (you must use the same data type while reassigning)
val <name> : <type> = <literal>
var <name> : <type> = <literal>
val firstNumber : Int = 100
var firstName : String = "Raja"
scala> val firstNumber : Int = 100
firstNumber: Int = 100
scala> var firstName : String = "Raja"
firstName: String = Raja
scala> var myvar : Int = 10
myvar: Int = 10
scala> val myval : Double = 2.5
myval: Double = 2.5
scala> val sa : Float = 5.3f
sa: Float = 5.3
scala> var myvar : Int = 100
myvar: Int = 100
// type mismatch
scala> myvar = "sarega"
<console>:25: error: type mismatch;
found : String("sarega")
required: Int
myvar = "sarega"
^
// reassignment to val error
scala> val myval = 100
myval: Int = 100
scala> myval = "sarega"
<console>:25: error: reassignment to val
myval = "sarega"
^
scala> myval = 23
<console>:25: error: reassignment to val
myval = 23
^
scala> val c = 12
c: Int = 12
scala> val my_string = "Hello"
my_string: String = Hello
scala> val 2my_String ="Hello"
<console>:1: error: Invalid literal number
val 2my_String ="Hello"
^
scala> val my.string = "Hello"
<console>:23: error: not found: value my
val my.string = "Hello"
^
// backticks allow an identifier that contains a dot
scala> val `my.string` = "Hello"
my.string: String = Hello
scala> `my.string`
res23: String = Hello
scala> true
res24: Boolean = true
scala> false
res25: Boolean = false
scala> 1 > 2
res26: Boolean = false
scala> 1 < 2
res27: Boolean = true
scala> 1 == 1
res28: Boolean = true
scala> 1 == 1.0
res29: Boolean = true
scala> 1 >= 1
res30: Boolean = true
scala> 1 <= 30
res31: Boolean = true
scala> 2 == 2
res32: Boolean = true
scala> 2 != 4
res33: Boolean = true
//Modulo
scala> 4 % 2
res34: Int = 0
scala> 10 % 3
res35: Int = 1
scala> 5 % 2
res36: Int = 1
scala> 19 % 5
res37: Int = 4
scala> println("hello")
hello
scala> print("hello")
hello
scala> println('zilog')
<console>:1: error: unclosed character literal
println('zilog')
^
scala> val farewell = "good"+"bye"
farewell: String = goodbye
scala> "good"+"bye"
res40: String = goodbye
scala> "dance "*5
res41: String = "dance dance dance dance dance "
scala> " _ "*10
res42: String = " _ _ _ _ _ _ _ _ _ _ "
scala> val st = "hello"
st: String = hello
scala> st
res45: String = hello
scala> st.co
codePointAt collect compare concat copyToArray
codePointBefore collectFirst compareTo contains copyToBuffer
codePointCount combinations compareToIgnoreCase containsSlice corresponds
codePoints companion compose contentEquals count
scala> st.concat("sare")
res46: String = hellosare
scala> st.concat(" sare")
res47: String = hello sare
scala> st.length
res48: Int = 5
// String interpolation - injecting a value into a string
// an s or f prefix is needed
scala> val name = "Jose"
name: String = Jose
// s as prefix
scala> val greet = s"Hello ${name}"
greet: String = Hello Jose
scala> val greet = s"Hello $name"
greet: String = Hello Jose
scala> print(s"Hello $name")
Hello Jose
scala> val greet = s"Hello $name"
greet: String = Hello Jose
//f as prefix
scala> val greet = f"Helo $name"
greet: String = Helo Jose
scala> val greet ="hello $name"
greet: String = hello $name
//printf - similar to c
scala> printf("A string %s, an integer %d, a float %f","siva",5,5.5f)
A string siva, an integer 5, a float 5.500000
String functions:
----------------
scala> val st = "This is a long string"
st: String = This is a long string
scala> st.charAt(3)
res55: Char = s
scala> st.charAt(3)
res56: Char = s
scala> st.indexOf("a")
res57: Int = 8
scala> st.slice(0,4)
res58: String = This
scala> st slice (0,4)
res59: String = This
scala> st.slice(10,14)
res60: String = long
Regular Expression / Pattern Matching:
--------------------------------------
scala> val st = "Aravinda De Silva"
st: String = Aravinda De Silva
scala> st.matches("Aravinda De Silva")
res61: Boolean = true
scala> st matches "Aravinda"
res62: Boolean = false
scala> st.contains("Aravinda")
res63: Boolean = true
scala> !st.contains("Aravinda")
res64: Boolean = false
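Note that matches expects a regular expression that must cover the whole string, while contains does a plain substring search; a quick sketch against the same st value:
st.matches("Aravinda.*")   // true  - the regex matches the entire string
st.matches("Silva")        // false - "Silva" alone does not cover the whole string
st.contains("Silva")       // true  - plain substring search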
Tuples:
used to hold an ordered sequence of values of multiple data types
scala> (1,2.2,"hello")
res68: (Int, Double, String) = (1,2.2,hello)
scala> val my_tup = (1,2,"hello",23.2,true)
my_tup: (Int, Int, String, Double, Boolean) = (1,2,hello,23.2,true)
scala> (3,1,(2,3))
res69: (Int, Int, (Int, Int)) = (3,1,(2,3))
scala> my_tup._3
res70: String = hello
scala> my_tup._5
res71: Boolean = true
Assessment:
--------------
// what is 2 to the power of 5?
scala> math.pow(2,5)
res72: Double = 32.0
//what is the remainder of 180 divided by 7?
scala> 180 % 7
res73: Int = 5
// Given the variable pet_name = "Sammy", use string interpolation to print out
// "My dog's name is Sammy"
scala> val pet_name = "Sammy"
pet_name: String = Sammy
scala> println(s"My dog's name is $pet_name")
My dog's name is Sammy
scala> println(f"My dog's name is $pet_name")
My dog's name is Sammy
// Use scala to find out if the letter sequence "xyz" is contained in "sadfjhyuxyzfuigjklmhasyysdfk"
scala> val s = "sadfjhyuxyzfuigjklmhasyysdfk"
s: String = sadfjhyuxyzfuigjklmhasyysdfk
scala> if (s contains "xyz") println ("Found") else println("Not found")
Found
scala> val s = "sadfjhyuxzfuigjklmhasyysdfk"
s: String = sadfjhyuxzfuigjklmhasyysdfk
scala> if (s contains "xyz") println ("Found") else println("Not found")
Not found
val vs var:
-----------
val is an immutable storage unit: you can assign data when you define it, but you can't reassign it.
var is a mutable storage unit: you can assign and reassign data at any time.
// Given the tuple (1,2,3,(4,5,6)) --> retrieve the number 6
scala> val myTuple = (1,2,3,(4,5,6))
myTuple: (Int, Int, Int, (Int, Int, Int)) = (1,2,3,(4,5,6))
scala> myTuple._3_2
<console>:26: error: value _3_2 is not a member of (Int, Int, Int, (Int, Int, Int))
myTuple._3_2
^
scala> myTuple.3
<console>:1: error: ';' expected but double literal found.
myTuple.3
^
scala> myTuple._1
res82: Int = 1
scala> myTuple._2
res83: Int = 2
scala> myTuple._3
res84: Int = 3
scala> myTuple._4
res85: (Int, Int, Int) = (4,5,6)
// while retrieving tuple elements, use the position number instead of a zero-based index
// positions start from 1
scala> myTuple._1
res87: Int = 1
scala> myTuple._2
res88: Int = 2
scala> myTuple._3
res89: Int = 3
scala> myTuple._4._1
res90: Int = 4
scala> myTuple._4._2
res91: Int = 5
scala> myTuple._4._3
res92: Int = 6
scala> val t = (1,2,3,(4,5,6))
t: (Int, Int, Int, (Int, Int, Int)) = (1,2,3,(4,5,6))
scala> val nest = t._4
nest: (Int, Int, Int) = (4,5,6)
scala> val result = nest._3
result: Int = 6
scala> t._4._3
res94: Int = 6
List Lesson:
-------------
Singly linked list
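Because a List is a singly linked list, it is built from a head element and a tail list with the :: (cons) operator; a minimal sketch:
val built = 2 :: 4 :: 6 :: Nil   // List(2, 4, 6) -- Nil is the empty list
built.head                       // 2
built.tail                       // List(4, 6)
0 :: built                       // prepending is cheap: List(0, 2, 4, 6)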
scala> val evens = List(2,4,6,8,10)
evens: List[Int] = List(2, 4, 6, 8, 10)
scala> val mixedList = List(1,2.3,true,"awesome")
mixedList: List[Any] = List(1, 2.3, true, awesome)
// indexed sequence -- start from 0
scala> evens(0)
res95: Int = 2
scala> evens(4)
res96: Int = 10
scala> evens.head // very first item
res97: Int = 2
scala> evens.tail // except 1st item
res98: List[Int] = List(4, 6, 8, 10)
scala> mixedList.tail
res99: List[Any] = List(2.3, true, awesome)
scala> mixedList.head
res100: Any = 1
scala> val my_list = List(List(1,2,3),List(4,5,6))
my_list: List[List[Int]] = List(List(1, 2, 3), List(4, 5, 6))
scala> my_list(0)
res101: List[Int] = List(1, 2, 3)
scala> my_list(1)
res102: List[Int] = List(4, 5, 6)
scala> my_list(0)(2)
res103: Int = 3
scala> my_list(1)(2)
res104: Int = 6
scala> val my_list = List(("a",1),("b",2),("c",3))
my_list: List[(String, Int)] = List((a,1), (b,2), (c,3))
scala> my_list(0)
res105: (String, Int) = (a,1)
scala> my_list(1)
res106: (String, Int) = (b,2)
scala> my_list(2)
res107: (String, Int) = (c,3)
scala> my_list
res112: List[(String, Int)] = List((a,1), (b,2), (c,3))
scala> val x = my_list(0)
x: (String, Int) = (a,1)
scala> x
res113: (String, Int) = (a,1)
scala> x._1
res114: String = a
scala> x._2
res115: Int = 1
scala> my_list(0)._1
res116: String = a
scala> my_list(0)._2
res117: Int = 1
scala> val myList = List(3,6,1,7,10)
myList: List[Int] = List(3, 6, 1, 7, 10)
scala> myList.sorted
res118: List[Int] = List(1, 3, 6, 7, 10)
scala> myList.size
res119: Int = 5
scala> myList.length
res120: Int = 5
scala> myList.max
res121: Int = 10
scala> myList.sum
res122: Int = 27
scala> myList.min
res123: Int = 1
scala> myList.product
res125: Int = 1260
/// drop the first two elements (from the head end)
scala> val x = List(1,2,3,4)
x: List[Int] = List(1, 2, 3, 4)
scala> x.drop(2)
res126: List[Int] = List(3, 4)
scala> x.takeRight(1)
res127: List[Int] = List(4)
// take 3 elements from the right side of x
scala> x.takeRight(3)
res128: List[Int] = List(2, 3, 4)
scala> val x = List(1,2,3,4,5,6,7,8)
x: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 8)
scala> x.slice(0,3)
res130: List[Int] = List(1, 2, 3)
scala> x.slice(4,7)
res134: List[Int] = List(5, 6, 7)
1 2 3 4 5 6 7 8
slice(from, until): from = starting index (inclusive), until = ending index (exclusive), both counted from 0
e.g. slice(4,7) starts at index 4 (the value 5) and stops before index 7, giving List(5, 6, 7)
Arrays:
Mutable
Scala developers use Lists more often than Arrays
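A short sketch of the difference: an Array element can be updated in place, while the default List is immutable, so "changing" it really builds a new List:
val nums = Array(1, 2, 3)
nums(0) = 99                 // in-place update is allowed: Array(99, 2, 3)
val lst = List(1, 2, 3)
val lst2 = 0 :: lst          // no mutation: lst stays List(1, 2, 3), lst2 is List(0, 1, 2, 3)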
scala> val arr = Array(1,2,3)
arr: Array[Int] = Array(1, 2, 3)
scala> val arr = Array('a','b','c')
arr: Array[Char] = Array(a, b, c)
scala> val arr = Array("Sare","rega","gama")
arr: Array[String] = Array(Sare, rega, gama)
scala> Array.range(0,10)
res135: Array[Int] = Array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
scala> Array.range(0,10,2)
res136: Array[Int] = Array(0, 2, 4, 6, 8)
scala> Range(0,5)
res137: scala.collection.immutable.Range = Range(0, 1, 2, 3, 4)
scala> Range(10,100,5)
res139: scala.collection.immutable.Range = Range(10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95)
Set:
----
A Set is a collection with no duplicate elements
Elements in a Set are unique
There are mutable Sets and immutable Sets
scala> val s = Set()
s: scala.collection.immutable.Set[Nothing] = Set()
scala> val s = Set(1,2,3)
s: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
// my input has multiple duplicate values, but after assigning them into a Set
// the Set filtered out all the duplicate values
scala> val s = Set(1,1,1,2,3,4,2,3,5,4,2,3,5,5,2,3,4)
s: scala.collection.immutable.Set[Int] = Set(5, 1, 2, 3, 4)
// Set is Immutable by default.
scala> val s = Set(1,2,3)
s: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
// explicitly call mutable
scala> val s = collection.mutable.Set(1,2,3,4)
s: scala.collection.mutable.Set[Int] = Set(1, 2, 3, 4)
// Immutable set: when we try to add anything extra, it throws an error
scala> val s = Set(1,2,3)
s: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
scala> s += 4
<console>:26: error: value += is not a member of scala.collection.immutable.Set[Int]
s += 4
^
// we explicitly use a mutable set, so we can add extra elements to the existing ones
scala> val s = collection.mutable.Set(1,2,3,4)
s: scala.collection.mutable.Set[Int] = Set(1, 2, 3, 4)
scala> s += 5 // using += operator to add an extra element to existing Set
res143: s.type = Set(1, 5, 2, 3, 4)
scala> s.add(6) // using the add function to add an additional element to the existing Set
//New element added successfully (6 was not a duplicate)
res144: Boolean = true
scala> s.add(5) // element not added: 5 is already in the Set (duplicate entry)
res145: Boolean = false
scala> s
res146: scala.collection.mutable.Set[Int] = Set(1, 5, 2, 6, 3, 4)
A Set doesn't retain insertion order
It is an unordered collection of unique elements
scala> s.max
res149: Int = 6
scala> s.min
res150: Int = 1
/// I have a List with duplicate elements
scala> val myList = List(1,2,3,1,2,3)
myList: List[Int] = List(1, 2, 3, 1, 2, 3)
// Making a new immutable Set with data taken from existing List
scala> val myNewSet = myList.toSet
myNewSet: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
// all duplicate elements removed
scala> myNewSet
res152: scala.collection.immutable.Set[Int] = Set(1, 2, 3)
scala> val myMap = Map(("a",1),("b",2),("c",3),("d",4))
myMap: scala.collection.immutable.Map[String,Int] = Map(a -> 1, b -> 2, c -> 3, d -> 4)
scala> myMap.keys
res153: Iterable[String] = Set(a, b, c, d)
scala> myMap.values
res154: Iterable[Int] = MapLike(1, 2, 3, 4)
scala> myMap("c")
res155: Int = 3
scala> myMap("d")
res156: Int = 4
scala> myMap("e")
java.util.NoSuchElementException: key not found: e
at scala.collection.MapLike$class.default(MapLike.scala:228)
at scala.collection.AbstractMap.default(Map.scala:59)
at scala.collection.MapLike$class.apply(MapLike.scala:141)
at scala.collection.AbstractMap.apply(Map.scala:59)
... 49 elided
scala> myMap.getOrElse("e","Not found")
res158: Any = Not found
scala> myMap get "d"
res159: Option[Int] = Some(4)
scala>
scala> myMap get "e"
res160: Option[Int] = None
// error
scala> myMap getOrElse "e", "Nothing found"
<console>:1: error: ';' expected but ',' found.
myMap getOrElse "e", "Nothing found"
scala> val myMutMap = collection.mutable.Map("x"->1,"y"->2,"z"->3)
myMutMap: scala.collection.mutable.Map[String,Int] = Map(z -> 3, y -> 2, x -> 1)
//adding new element to the existing map
scala> myMutMap += ("a"->0)
res163: myMutMap.type = Map(z -> 3, y -> 2, a -> 0, x -> 1)
scala> myMutMap += ("a"->0)
Assessment:
-------------
1.) Can you figure out what method you can use to find out if the list:
List(1,2,3,4,5) contains the number 3?
scala> val myList = List(1,2,3,4,5)
myList: List[Int] = List(1, 2, 3, 4, 5)
scala> myList.co
collect combinations compose containsSlice copyToArray corresponds
collectFirst companion contains copy copyToBuffer count
myList contains 5
scala> myList.contains
contains containsSlice
scala> myList.contains(5)
res164: Boolean = true
2.) How can you add all the elements of the previous list?
//using Sum
scala> myList.sum
res167: Int = 15
//using reduce
scala> myList.reduce(_+_)
res169: Int = 15
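reduce(_+_) combines the elements pairwise with +; an equivalent sketch with foldLeft, which also lets you choose the starting value:
myList.foldLeft(0)(_ + _)    // 15 -- start from 0 and add each element in turn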
3.) Create an Array of all the odd numbers from 1 to 15
scala> Range(1,15,2)
res172: scala.collection.immutable.Range = Range(1, 3, 5, 7, 9, 11, 13)
// the end value is exclusive, so 15 is missing above; use 16 as the end to include it
scala> Array.range(1,16,2)
res173: Array[Int] = Array(1, 3, 5, 7, 9, 11, 13, 15)
4.) What are the unique elements in the list: List(2,3,1,4,5,6,6,1,2)?
scala> val myList = List(2,3,1,4,5,6,6,1,2)
myList: List[Int] = List(2, 3, 1, 4, 5, 6, 6, 1, 2)
scala> val mySet = myList.toSe
toSeq toSet
scala> val mySet = myList.toSet
mySet: scala.collection.immutable.Set[Int] = Set(5, 1, 6, 2, 3, 4)
Create a mutable map that maps Names to Ages.
It should have the following key value pairs:
Sammy, 3
Frankie, 7
John, 45
scala> val myMutMap = collection.mutable.Map(("Sammy",3),("Frankie",7),("John",45))
myMutMap: scala.collection.mutable.Map[String,Int] = Map(Sammy -> 3, Frankie -> 7, John -> 45)
Now do the following:
5a) Print out all the keys
scala> myMutMap.keys.foreach(println)
Sammy
Frankie
John
5b) Add the key value pair ("Mike",27)
scala> myMutMap += ("Mike"->27)
res171: myMutMap.type = Map(Sammy -> 3, Mike -> 27, Frankie -> 7, John -> 45)
scala>
Atom IDE
----------
install terminal-plus package
Ctrl + Shift + t or Alt + Shift + t ==> will open Terminal
if (true){
println("I will print if True")
}
scala> :load 1.scala
Loading 1.scala...
I will print if True
if (3 == 3){
println("3 is equal to 3")
}
scala> :load 1.scala
Loading 1.scala...
3 is equal to 3
val x = "Hello"
if (x.endsWith("o")){
println("The value of x ends with o")
}
Result:
scala> :load 1.scala
Loading 1.scala...
x: String = Hello
The value of x ends with o
val x = "zzzz"
if (x.endsWith("o")){
println("The value of x ends with o")
}else
{
println("The value of x doesn\'t end with o")
}
Result:
--------
scala> :load 1.scala
Loading 1.scala...
x: String = zzzz
The value of x doesn't end with o
val person = "George"
if (person == "Sammy"){
println("Welcome Sammy")
}else if (person == "George"){
println("Welcome George")
}else{
println("What is your name?")
}
Result:
scala> :load 1.scala
Loading 1.scala...
person: String = George
Welcome George
println((1 == 2) && (2 == 2))
scala> :load 1.scala
Loading 1.scala...
false
println((1==2) || (2==2))
scala> :load 1.scala
Loading 1.scala...
true
println(!(1 == 2))
scala> :load 1.scala
Loading 1.scala...
true
for(item <- List(1,2,3,4,5)){
println("Hello..")
}
scala> :load 1.scala
Loading 1.scala...
Hello..
Hello..
Hello..
Hello..
Hello..
for(num <- List(1,2,3,4,5)){
println(s"Hello..$num")
}
//string interpolation
scala> :load 1.scala
Loading 1.scala...
Hello..1
Hello..2
Hello..3
Hello..4
Hello..5
for(num <- Array.range(0,5)){
println(s"Hello..$num")
}
scala> :load 1.scala
Loading 1.scala...
Hello..0
Hello..1
Hello..2
Hello..3
Hello..4
//Set will eliminate duplicates
for(num <- Set(1,1,2,2,3,3,4,4,3,3,2,2,1,1,2,3,4,5,6)){
println(s"Hello..$num")
}
scala> :load 1.scala
Loading 1.scala...
Hello..5
Hello..1
Hello..6
Hello..2
Hello..3
Hello..4
for (num <- Range(0,10))
{
if (num % 2 == 0)
{
println(s"$num is even")
}
else
{
println(s"$num is odd")
}
}
Result:
scala> :load 1.scala
Loading 1.scala...
0 is even
1 is odd
2 is even
3 is odd
4 is even
5 is odd
6 is even
7 is odd
8 is even
9 is odd
val names = List("John","Abe","Cindy","Cat")
for (name <- names){
if (name.startsWith("C")){
println(s"$name starts with a C")
}
}
var x = 0
while (x <5 ){
println(s"x is currently $x")
println("x is still less than 5, adding 1 to x")
x += 1
}
scala> :load 1.scala
Loading 1.scala...
names: List[String] = List(John, Abe, Cindy, Cat)
Cindy starts with a C
Cat starts with a C
Result:
scala> :load 1.scala
Loading 1.scala...
x: Int = 0
x is currently 0
x is still less than 5, adding 1 to x
x is currently 1
x is still less than 5, adding 1 to x
x is currently 2
x is still less than 5, adding 1 to x
x is currently 3
x is still less than 5, adding 1 to x
x is currently 4
x is still less than 5, adding 1 to x
Break Example:
--------------
import util.control.Breaks._
var y = 0
while (y < 10){
println(s"y is currently $y")
println("y is still less than 10, adding 1 to y")
y += 1
if (y == 3) break
}
Result:
scala> :load 1.scala
Loading 1.scala...
import util.control.Breaks._
y: Int = 0
y is currently 0
y is still less than 10, adding 1 to y
y is currently 1
y is still less than 10, adding 1 to y
y is currently 2
y is still less than 10, adding 1 to y
scala.util.control.BreakControl
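The scala.util.control.BreakControl line above is the control-flow exception that break throws; to have it caught instead of escaping the script, the loop is normally wrapped in breakable. A sketch using the same import:
import util.control.Breaks._
var y = 0
breakable {
  while (y < 10) {
    println(s"y is currently $y")
    y += 1
    if (y == 3) break   // caught by the enclosing breakable block
  }
}
println("continues normally after the loop")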
Function examples:
---------------------
//Function which takes two Ints and returns their sum (an Int, not Unit)
def adder(num1:Int, num2:Int): Int = {
return num1 + num2
}
adder(4,5)
Result:
--------
// adder returns an Int, so it is not a Unit (void) function
scala> :load 1.scala
Loading 1.scala...
adder: (num1: Int, num2: Int)Int
res38: Int = 9
//Function which returns a string
def greetName(name:String) : String = {
return(s"Hello $name")
}
val fullgreet = greetName("Jose")
println(fullgreet)
scala> :load 1.scala
Loading 1.scala...
greetName: (name: String)String
fullgreet: String = Hello Jose
Hello Jose
//Prime Number checking using Functions
def isPrime(numcheck:Int) : Boolean = {
for(n <- Range(2,numcheck)){
if (numcheck % n == 0){
return false
}
}
return true
}
println(isPrime(10))
println(isPrime(23))
Result:
-----------
scala> :load 1.scala
Loading 1.scala...
isPrime: (numcheck: Int)Boolean
false
true
// write a function to check whether a given single value is even
def checkEven(num:Int ) : Boolean = {
return (num % 2 == 0)
}
println(checkEven(3))
println(checkEven(4))
Result:
scala> :load 1.scala
Loading 1.scala...
checkEven: (num: Int)Boolean
false
true
//write a function that returns true if there is an even number inside a List; otherwise return false
def checkEvenInAList(numbers:List[Int]): Boolean = {
for (n <- numbers)
{
if (n % 2 == 0)
{
return true
}
}
return false
}
val evenSample = List(1,2,3,4,5)
val oddSample = List(1,3,5,7,9)
println(checkEvenInAList(evenSample))
println(checkEvenInAList(oddSample))
scala> :load 1.scala
Loading 1.scala...
checkEvenInAList: (numbers: List[Int])Boolean
evenSample: List[Int] = List(1, 2, 3, 4, 5)
oddSample: List[Int] = List(1, 3, 5, 7, 9)
true
false
// sum of the numbers, but when a 7 is found add 14 instead of 7
/// i.e. if you find a 7 in the List while summing, add 14 for it
def lucky(nums:List[Int]): Int = {
var output = 0
for (n <- nums)
{
if (n == 7)
{
output += 14
}
else
{
output += n
}
}
return output
}
val numbers = List(1,2,3,7)
println(lucky(numbers))
println(lucky(List(1,2,3,4,5)))
println(lucky(List(7,7,7,7,7)))
scala> :load 1.scala
Loading 1.scala...
lucky: (nums: List[Int])Int
numbers: List[Int] = List(1, 2, 3, 7)
20
15
70
Palindrome Check:
----------------
def palindromeCheck(st:String) : Boolean = {
return (st == st.reverse)
}
println(palindromeCheck("abba"))
println(palindromeCheck("hello"))
Result:
scala> :load 1.scala
Loading 1.scala...
palindromeCheck: (st: String)Boolean
true
false
Introduction to Data Frames:
----------------------------
What is Spark DataFrames?
How to use it?
Programming
Exercises
A DataFrame is like a large Excel sheet with rows x columns
It is now the standard way of dealing with data in Spark with Scala
RDD --> DataFrame syntax
Similar to Python Pandas and R data frames
API Documentation - a must to go through
DataFrame Overview
DataFrame Spark with Scala
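To make the rows x columns idea concrete, here is a minimal sketch that builds a tiny DataFrame from an ordinary Scala collection (it assumes a SparkSession named spark, as in the scripts below; the sample values are made up):
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._    // enables .toDF on Scala collections
val small = Seq(("2006-01-03", 490.0), ("2006-01-04", 488.6)).toDF("Date", "Open")
small.show()
small.printSchema()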
df.scala:
---------
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.csv("c://source//CitiGroup2006_2008.txt")
df.head()
scala> df.take(5).foreach(println)
[Date,Open,High,Low,Close,Volume]
[2006-01-03,490.0,493.8,481.1,492.9,1537660]
[2006-01-04,488.6,491.0,483.5,483.8,1871020]
[2006-01-05,484.4,487.8,484.0,486.2,1143160]
[2006-01-06,488.8,489.0,482.0,486.2,1370250]
scala> df.head
res6: org.apache.spark.sql.Row = [Date,Open,High,Low,Close,Volume]
scala> df.schema
res9: org.apache.spark.sql.types.StructType = StructType(StructField(_c0,StringType,true), StructField(_c1,StringType,true), StructField(_c2,StringType,true), StructField(_c3,StringType,true), StructField(_c4,StringType,true), StructField(_c5,StringType,true))
// Here we didn't mention option("header","true").option("inferSchema","true")
// so the data types for all the fields are StringType
scala> df.schema.foreach(println)
StructField(_c0,StringType,true)
StructField(_c1,StringType,true)
StructField(_c2,StringType,true)
StructField(_c3,StringType,true)
StructField(_c4,StringType,true)
StructField(_c5,StringType,true)
scala> df.head(5).foreach(println)
[Date,Open,High,Low,Close,Volume]
[2006-01-03,490.0,493.8,481.1,492.9,1537660]
[2006-01-04,488.6,491.0,483.5,483.8,1871020]
[2006-01-05,484.4,487.8,484.0,486.2,1143160]
[2006-01-06,488.8,489.0,482.0,486.2,1370250]
scala> for(row <- df.head(5)){
| println(row)
| }
[Date,Open,High,Low,Close,Volume]
[2006-01-03,490.0,493.8,481.1,492.9,1537660]
[2006-01-04,488.6,491.0,483.5,483.8,1871020]
[2006-01-05,484.4,487.8,484.0,486.2,1143160]
[2006-01-06,488.8,489.0,482.0,486.2,1370250]
scala> val df = spark.read.option("header","true").csv("c:\\source\\CitiGroup2006_2008.txt")
df: org.apache.spark.sql.DataFrame = [Date: string, Open: string ... 4 more fields]
scala> df.take(5).foreach(println)
[2006-01-03,490.0,493.8,481.1,492.9,1537660]
[2006-01-04,488.6,491.0,483.5,483.8,1871020]
[2006-01-05,484.4,487.8,484.0,486.2,1143160]
[2006-01-06,488.8,489.0,482.0,486.2,1370250]
[2006-01-09,486.0,487.4,483.0,483.9,1680740]
scala> df.schema
res16: org.apache.spark.sql.types.StructType = StructType(StructField(Date,StringType,true), StructField(Open,StringType,true), StructField(High,StringType,true), StructField(Low,StringType,true), StructField(Close,StringType,true), StructField(Volume,StringType,true))
// we didn't put .option("inferSchema","true")
// so the data type for each field is String here
scala> df.schema.foreach(println)
StructField(Date,StringType,true)
StructField(Open,StringType,true)
StructField(High,StringType,true)
StructField(Low,StringType,true)
StructField(Close,StringType,true)
StructField(Volume,StringType,true)
scala>
// Here we used .option("header","true").option("inferSchema","true")
// so Spark assigned the proper data type to each field
scala> val df = spark.read.option("header","true").option("inferSchema","true").csv("c:\\source\\CitiGroup2006_2008.txt")
df: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 4 more fields]
scala> df.schema
res18: org.apache.spark.sql.types.StructType = StructType(StructField(Date,TimestampType,true), StructField(Open,DoubleType,true), StructField(High,DoubleType,true), StructField(Low,DoubleType,true), StructField(Close,DoubleType,true), StructField(Volume,IntegerType,true))
scala> df.schema.foreach(println)
StructField(Date,TimestampType,true)
StructField(Open,DoubleType,true)
StructField(High,DoubleType,true)
StructField(Low,DoubleType,true)
StructField(Close,DoubleType,true)
StructField(Volume,IntegerType,true)
scala> df.take(5).foreach(println)
[2006-01-03 00:00:00.0,490.0,493.8,481.1,492.9,1537660]
[2006-01-04 00:00:00.0,488.6,491.0,483.5,483.8,1871020]
[2006-01-05 00:00:00.0,484.4,487.8,484.0,486.2,1143160]
[2006-01-06 00:00:00.0,488.8,489.0,482.0,486.2,1370250]
[2006-01-09 00:00:00.0,486.0,487.4,483.0,483.9,1680740]
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("c://source//CitiGroup2006_2008.txt")
// if we don't use inferSchema, Spark will treat every column's data type as String.
df.head(5).foreach(println)
println("Using For Loop...")
for(row <- df.head(5)){
println(row)
}
scala> :load df.scala
Loading df.scala...
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1640f20f
df: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 4 more fields]
[2006-01-03 00:00:00.0,490.0,493.8,481.1,492.9,1537660]
[2006-01-04 00:00:00.0,488.6,491.0,483.5,483.8,1871020]
[2006-01-05 00:00:00.0,484.4,487.8,484.0,486.2,1143160]
[2006-01-06 00:00:00.0,488.8,489.0,482.0,486.2,1370250]
[2006-01-09 00:00:00.0,486.0,487.4,483.0,483.9,1680740]
Using For Loop...
[2006-01-03 00:00:00.0,490.0,493.8,481.1,492.9,1537660]
[2006-01-04 00:00:00.0,488.6,491.0,483.5,483.8,1871020]
[2006-01-05 00:00:00.0,484.4,487.8,484.0,486.2,1143160]
[2006-01-06 00:00:00.0,488.8,489.0,482.0,486.2,1370250]
[2006-01-09 00:00:00.0,486.0,487.4,483.0,483.9,1680740]
scala> df.columns
res8: Array[String] = Array(Date, Open, High, Low, Close, Volume)
scala> df.columns.foreach(println)
Date
Open
High
Low
Close
Volume
scala> for(c <- df.columns){
| println(c)
| }
Date
Open
High
Low
Close
Volume
scala> df.describe().show
+-------+------------------+-----------------+------------------+------------------+-----------------+
|summary| Open| High| Low| Close| Volume|
+-------+------------------+-----------------+------------------+------------------+-----------------+
| count| 755| 755| 755| 755| 755|
| mean| 386.0923178807949|390.6590596026489|380.80170860927143| 385.3421456953643|6308596.382781457|
| stddev|149.32301134820133|148.5151130063523|150.53136890891344|149.83310074439177| 8099892.56297633|
| min| 54.4| 55.3| 30.5| 37.7| 632860|
| max| 566.0| 570.0| 555.5| 564.1| 102869289|
+-------+------------------+-----------------+------------------+------------------+-----------------+
scala> df.show
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740|
|2006-01-10 00:00:00|483.0|485.5|480.8|485.4|1365960|
|2006-01-11 00:00:00|495.8|495.8|485.8|489.8|1684440|
|2006-01-12 00:00:00|491.0|491.0|488.8|490.3|1230060|
|2006-01-13 00:00:00|491.0|491.9|487.3|489.2| 940930|
|2006-01-17 00:00:00|485.1|487.0|482.7|484.3|1237830|
|2006-01-18 00:00:00|484.3|486.7|481.1|483.8|1218910|
|2006-01-19 00:00:00|485.6|485.8|477.0|479.4|1696500|
|2006-01-20 00:00:00|472.1|474.0|456.3|456.9|4781930|
|2006-01-23 00:00:00|460.0|463.8|457.0|460.0|2025500|
|2006-01-24 00:00:00|462.9|463.6|459.9|460.1|2083740|
|2006-01-25 00:00:00|461.4|463.7|460.1|462.3|1591940|
|2006-01-26 00:00:00|465.5|475.5|464.5|470.1|1988600|
|2006-01-27 00:00:00|470.1|473.7|466.0|468.7|1412760|
|2006-01-30 00:00:00|468.7|469.9|466.6|468.2|1057630|
|2006-01-31 00:00:00|468.3|470.5|465.5|465.8|1887280|
+-------------------+-----+-----+-----+-----+-------+
only showing top 20 rows
scala> df.select("Open","High","Low").show
+-----+-----+-----+
| Open| High| Low|
+-----+-----+-----+
|490.0|493.8|481.1|
|488.6|491.0|483.5|
|484.4|487.8|484.0|
|488.8|489.0|482.0|
|486.0|487.4|483.0|
|483.0|485.5|480.8|
|495.8|495.8|485.8|
|491.0|491.0|488.8|
|491.0|491.9|487.3|
|485.1|487.0|482.7|
|484.3|486.7|481.1|
|485.6|485.8|477.0|
|472.1|474.0|456.3|
|460.0|463.8|457.0|
|462.9|463.6|459.9|
|461.4|463.7|460.1|
|465.5|475.5|464.5|
|470.1|473.7|466.0|
|468.7|469.9|466.6|
|468.3|470.5|465.5|
+-----+-----+-----+
only showing top 20 rows
scala> df.schema
res23: org.apache.spark.sql.types.StructType = StructType(StructField(Date,TimestampType,true), StructField(Open,DoubleType,true), StructField(High,DoubleType,true), StructField(Low,DoubleType,true),
StructField(Close,DoubleType,true), StructField(Volume,IntegerType,true))
scala> df.schema.foreach(println)
StructField(Date,TimestampType,true)
StructField(Open,DoubleType,true)
StructField(High,DoubleType,true)
StructField(Low,DoubleType,true)
StructField(Close,DoubleType,true)
StructField(Volume,IntegerType,true)
scala> df.select("Open").show(5)
+-----+
| Open|
+-----+
|490.0|
|488.6|
|484.4|
|488.8|
|486.0|
+-----+
only showing top 5 rows
scala> df.select("Open").take(5)
res29: Array[org.apache.spark.sql.Row] = Array([490.0], [488.6], [484.4], [488.8], [486.0])
scala> df.select("Open").take(5).foreach(println)
[490.0]
[488.6]
[484.4]
[488.8]
[486.0]
scala> df.select($"Date",$"Open",$"Low").show()
+-------------------+-----+-----+
| Date| Open| Low|
+-------------------+-----+-----+
|2006-01-03 00:00:00|490.0|481.1|
|2006-01-04 00:00:00|488.6|483.5|
|2006-01-05 00:00:00|484.4|484.0|
|2006-01-06 00:00:00|488.8|482.0|
|2006-01-09 00:00:00|486.0|483.0|
|2006-01-10 00:00:00|483.0|480.8|
|2006-01-11 00:00:00|495.8|485.8|
|2006-01-12 00:00:00|491.0|488.8|
|2006-01-13 00:00:00|491.0|487.3|
|2006-01-17 00:00:00|485.1|482.7|
|2006-01-18 00:00:00|484.3|481.1|
|2006-01-19 00:00:00|485.6|477.0|
|2006-01-20 00:00:00|472.1|456.3|
|2006-01-23 00:00:00|460.0|457.0|
|2006-01-24 00:00:00|462.9|459.9|
|2006-01-25 00:00:00|461.4|460.1|
|2006-01-26 00:00:00|465.5|464.5|
|2006-01-27 00:00:00|470.1|466.0|
|2006-01-30 00:00:00|468.7|466.6|
|2006-01-31 00:00:00|468.3|465.5|
+-------------------+-----+-----+
only showing top 20 rows
scala> df.select("Date","Open","Low").show()
+-------------------+-----+-----+
| Date| Open| Low|
+-------------------+-----+-----+
|2006-01-03 00:00:00|490.0|481.1|
|2006-01-04 00:00:00|488.6|483.5|
|2006-01-05 00:00:00|484.4|484.0|
|2006-01-06 00:00:00|488.8|482.0|
|2006-01-09 00:00:00|486.0|483.0|
|2006-01-10 00:00:00|483.0|480.8|
|2006-01-11 00:00:00|495.8|485.8|
|2006-01-12 00:00:00|491.0|488.8|
|2006-01-13 00:00:00|491.0|487.3|
|2006-01-17 00:00:00|485.1|482.7|
|2006-01-18 00:00:00|484.3|481.1|
|2006-01-19 00:00:00|485.6|477.0|
|2006-01-20 00:00:00|472.1|456.3|
|2006-01-23 00:00:00|460.0|457.0|
|2006-01-24 00:00:00|462.9|459.9|
|2006-01-25 00:00:00|461.4|460.1|
|2006-01-26 00:00:00|465.5|464.5|
|2006-01-27 00:00:00|470.1|466.0|
|2006-01-30 00:00:00|468.7|466.6|
|2006-01-31 00:00:00|468.3|465.5|
+-------------------+-----+-----+
only showing top 20 rows
//Computed column - new column at run time
scala> df.withColumn("High + Low",df("High") + df("Low")).show()
+-------------------+-----+-----+-----+-----+-------+-----------------+
| Date| Open| High| Low|Close| Volume| High + Low|
+-------------------+-----+-----+-----+-----+-------+-----------------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|974.9000000000001|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020| 974.5|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160| 971.8|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250| 971.0|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740| 970.4|
|2006-01-10 00:00:00|483.0|485.5|480.8|485.4|1365960| 966.3|
|2006-01-11 00:00:00|495.8|495.8|485.8|489.8|1684440| 981.6|
|2006-01-12 00:00:00|491.0|491.0|488.8|490.3|1230060| 979.8|
|2006-01-13 00:00:00|491.0|491.9|487.3|489.2| 940930| 979.2|
|2006-01-17 00:00:00|485.1|487.0|482.7|484.3|1237830| 969.7|
|2006-01-18 00:00:00|484.3|486.7|481.1|483.8|1218910| 967.8|
|2006-01-19 00:00:00|485.6|485.8|477.0|479.4|1696500| 962.8|
|2006-01-20 00:00:00|472.1|474.0|456.3|456.9|4781930| 930.3|
|2006-01-23 00:00:00|460.0|463.8|457.0|460.0|2025500| 920.8|
|2006-01-24 00:00:00|462.9|463.6|459.9|460.1|2083740| 923.5|
|2006-01-25 00:00:00|461.4|463.7|460.1|462.3|1591940| 923.8|
|2006-01-26 00:00:00|465.5|475.5|464.5|470.1|1988600| 940.0|
|2006-01-27 00:00:00|470.1|473.7|466.0|468.7|1412760| 939.7|
|2006-01-30 00:00:00|468.7|469.9|466.6|468.2|1057630| 936.5|
|2006-01-31 00:00:00|468.3|470.5|465.5|465.8|1887280| 936.0|
+-------------------+-----+-----+-----+-----+-------+-----------------+
only showing top 20 rows
scala> val df2 = df.withColumn("High + Low",df("High")+df("Low"))
df2: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 5 more fields]
scala> df2.select(df2("High + Low").as("HPL")).show() // alias name
+-----------------+
| HPL|
+-----------------+
|974.9000000000001|
| 974.5|
| 971.8|
| 971.0|
| 970.4|
| 966.3|
| 981.6|
| 979.8|
| 979.2|
| 969.7|
| 967.8|
| 962.8|
| 930.3|
| 920.8|
| 923.5|
| 923.8|
| 940.0|
| 939.7|
| 936.5|
| 936.0|
+-----------------+
only showing top 20 rows
PrintSchema:
------------
df: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 4 more fields]
root
|-- Date: timestamp (nullable = true)
|-- Open: double (nullable = true)
|-- High: double (nullable = true)
|-- Low: double (nullable = true)
|-- Close: double (nullable = true)
|-- Volume: integer (nullable = true)
scala> df.filter($"Close" > 480).show(5) // scala notation
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740|
+-------------------+-----+-----+-----+-----+-------+
only showing top 5 rows
//sql notation
scala> df.filter("Close > 480").show(5)
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740|
+-------------------+-----+-----+-----+-----+-------+
only showing top 5 rows
//scala notation
scala> df.filter($"Close" < 480 && $"High" < 480).show(5)
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-20 00:00:00|472.1|474.0|456.3|456.9|4781930|
|2006-01-23 00:00:00|460.0|463.8|457.0|460.0|2025500|
|2006-01-24 00:00:00|462.9|463.6|459.9|460.1|2083740|
|2006-01-25 00:00:00|461.4|463.7|460.1|462.3|1591940|
|2006-01-26 00:00:00|465.5|475.5|464.5|470.1|1988600|
+-------------------+-----+-----+-----+-----+-------+
only showing top 5 rows
//sql notation
scala> df.filter("Close < 480 AND High < 480").show(5)
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-20 00:00:00|472.1|474.0|456.3|456.9|4781930|
|2006-01-23 00:00:00|460.0|463.8|457.0|460.0|2025500|
|2006-01-24 00:00:00|462.9|463.6|459.9|460.1|2083740|
|2006-01-25 00:00:00|461.4|463.7|460.1|462.3|1591940|
|2006-01-26 00:00:00|465.5|475.5|464.5|470.1|1988600|
+-------------------+-----+-----+-----+-----+-------+
only showing top 5 rows
// store the filtered result in a val (it is still a DataFrame/Dataset, not yet collected)
scala> val CH_Low = df.filter("Close < 480 AND High < 480")
CH_Low: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Date: timestamp, Open: double ... 4 more fields]
// display the collected result
scala> CH_Low.show()
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-20 00:00:00|472.1|474.0|456.3|456.9|4781930|
|2006-01-23 00:00:00|460.0|463.8|457.0|460.0|2025500|
|2006-01-24 00:00:00|462.9|463.6|459.9|460.1|2083740|
|2006-01-25 00:00:00|461.4|463.7|460.1|462.3|1591940|
|2006-01-26 00:00:00|465.5|475.5|464.5|470.1|1988600|
|2006-01-27 00:00:00|470.1|473.7|466.0|468.7|1412760|
|2006-01-30 00:00:00|468.7|469.9|466.6|468.2|1057630|
|2006-01-31 00:00:00|468.3|470.5|465.5|465.8|1887280|
|2006-02-01 00:00:00|465.9|467.2|461.1|463.3|1844970|
|2006-02-02 00:00:00|459.0|461.0|451.0|451.8|2325470|
|2006-02-03 00:00:00|450.7|456.1|448.1|450.6|1666510|
|2006-02-06 00:00:00|452.6|456.1|450.9|451.7|1147430|
|2006-02-07 00:00:00|452.0|453.8|450.0|450.5|1207780|
|2006-02-08 00:00:00|453.3|455.3|450.7|453.6|1051370|
|2006-02-09 00:00:00|455.0|461.0|454.3|457.9|1357740|
|2006-02-10 00:00:00|457.0|460.7|452.5|459.6|1272030|
|2006-02-13 00:00:00|460.6|462.3|454.1|456.8|1158300|
|2006-02-14 00:00:00|457.8|462.5|457.1|461.2|1518040|
|2006-02-15 00:00:00|460.4|464.7|457.6|462.5|1700050|
|2006-02-16 00:00:00|463.0|464.4|460.4|464.4|1326000|
+-------------------+-----+-----+-----+-----+-------+
only showing top 20 rows
//collect
scala> val CH_Low = df.filter("Close < 480 AND High < 480").collect()
CH_Low: Array[org.apache.spark.sql.Row] = Array([2006-01-20 00:00:00.0,472.1,474.0,456.3,456.9,4781930], [2006-01-23 00:00:00.0,460.0,463.8,457.0,460.0,2025500], [2006-01-24 00:00:00.0,462.9,463.6,459
.9,460.1,2083740], [2006-01-25 00:00:00.0,461.4,463.7,460.1,462.3,1591940], [2006-01-26 00:00:00.0,465.5,475.5,464.5,470.1,1988600], [2006-01-27 00:00:00.0,470.1,473.7,466.0,468.7,1412760], [2006-01-3
0 00:00:00.0,468.7,469.9,466.6,468.2,1057630], [2006-01-31 00:00:00.0,468.3,470.5,465.5,465.8,1887280], [2006-02-01 00:00:00.0,465.9,467.2,461.1,463.3,1844970], [2006-02-02 00:00:00.0,459.0,461.0,451.
0,451.8,2325470], [2006-02-03 00:00:00.0,450.7,456.1,448.1,450.6,1666510], [2006-02-06 00:00:00.0,452.6,456.1,450.9,451.7,1147430], [2006-02-07 00:00:00.0,452.0,453.8,450.0,450.5,1207780], [2006-02...
//count result
scala> val CH_Low = df.filter("Close < 480 AND High < 480").count
CH_Low: Long = 397
scala> df.filter($"High" == 484.40).show() // look at this == symbol will result failure
<console>:32: error: overloaded method value filter with alternatives:
(func: org.apache.spark.api.java.function.FilterFunction[org.apache.spark.sql.Row])org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] <and>
(func: org.apache.spark.sql.Row => Boolean)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] <and>
(conditionExpr: String)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] <and>
(condition: org.apache.spark.sql.Column)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
cannot be applied to (Boolean)
df.filter($"High" == 484.40).show()
^
// scala notation needs the $ symbol as a prefix
scala> df.filter($"High" === 484.40).show() // use the === (triple equals) operator to make it work
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-04-27 00:00:00|472.0|484.4|471.5|481.5|2464800|
+-------------------+-----+-----+-----+-----+-------+
// sql notation doesn't need $ symbol
scala> df.filter("High = 484.40").show()
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-04-27 00:00:00|472.0|484.4|471.5|481.5|2464800|
+-------------------+-----+-----+-----+-----+-------+
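The $ column syntax comes from import spark.implicits._; an equivalent way to build the same column expression without $ is the df("columnName") form (a sketch against the same df):
df.filter(df("High") === 484.40).show()   // same result as df.filter($"High" === 484.40)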
scala> df.select(corr("High","Low")).show()
+------------------+
| corr(High, Low)|
+------------------+
|0.9992999172726325|
+------------------+
Look into: the DataFrame functions in the Apache Spark API documentation
Group By and Aggregate functions:
----------------------------------
//sales.csv
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("c://spark/mysources//sales.csv")
df.printSchema()
import spark.implicits._
//Result
scala> :load df.scala
Loading df.scala...
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6dd20ec9
df: org.apache.spark.sql.DataFrame = [Company: string, Person: string ... 1 more field]
root
|-- Company: string (nullable = true)
|-- Person: string (nullable = true)
|-- Sales: integer (nullable = true)
import spark.implicits._
scala> df.show(5)
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| GOOG| Sam| 200|
| GOOG|Charlie| 120|
| GOOG| Frank| 340|
| MSFT| Tina| 600|
| MSFT| Amy| 124|
+-------+-------+-----+
only showing top 5 rows
scala> df.printSchema
root
|-- Company: string (nullable = true)
|-- Person: string (nullable = true)
|-- Sales: integer (nullable = true)
scala> df.groupBy("Company").count().show()
+-------+-----+
|Company|count|
+-------+-----+
| GOOG| 3|
| FB| 2|
| MSFT| 3|
+-------+-----+
scala> df.groupBy("Company").max().show()
+-------+----------+
|Company|max(Sales)|
+-------+----------+
| GOOG| 340|
| FB| 870|
| MSFT| 600|
+-------+----------+
scala> df.groupBy("Company").min().show()
+-------+----------+
|Company|min(Sales)|
+-------+----------+
| GOOG| 120|
| FB| 350|
| MSFT| 124|
+-------+----------+
scala> df.groupBy("Company").sum().show()
+-------+----------+
|Company|sum(Sales)|
+-------+----------+
| GOOG| 660|
| FB| 1220|
| MSFT| 967|
+-------+----------+
//groupBy gives each individual Company's sum, max, min
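Several aggregates can also be requested in a single pass with agg; a sketch, assuming import org.apache.spark.sql.functions._ for the aggregate functions:
import org.apache.spark.sql.functions._
df.groupBy("Company")
  .agg(count("Sales"), sum("Sales"), min("Sales"), max("Sales"), avg("Sales"))
  .show()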
scala> df.select(countDistinct("Sales")).show()
+---------------------+
|count(DISTINCT Sales)|
+---------------------+
| 8|
+---------------------+
scala> df.select(sumDistinct("Sales")).show()
+-------------------+
|sum(DISTINCT Sales)|
+-------------------+
| 2847|
+-------------------+
scala> df.select(variance("Sales")).show()
+-----------------+
| var_samp(Sales)|
+-----------------+
|67235.55357142855|
+-----------------+
scala> df.select(stddev("Sales")).show()
+------------------+
|stddev_samp(Sales)|
+------------------+
|259.29819430807567|
+------------------+
scala> df.select(collect_set("Sales")).show()
+--------------------+
| collect_set(Sales)|
+--------------------+
|[350, 340, 870, 1...|
+--------------------+
// Individual company's total sales
scala> df.groupBy("Company").sum().show()
+-------+----------+
|Company|sum(Sales)|
+-------+----------+
| GOOG| 660|
| FB| 1220|
| MSFT| 967|
+-------+----------+
Total = 2847
// all companies' total sales summed together
scala> df.select(sum("Sales")).show()
+----------+
|sum(Sales)|
+----------+
| 2847|
+----------+
scala> df.show() /// default
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| GOOG| Sam| 200|
| GOOG|Charlie| 120|
| GOOG| Frank| 340|
| MSFT| Tina| 600|
| MSFT| Amy| 124|
| MSFT|Vanessa| 243|
| FB| Carl| 870|
| FB| Sarah| 350|
+-------+-------+-----+
// order by sales ascending
scala> df.orderBy("Sales").show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| GOOG|Charlie| 120|
| MSFT| Amy| 124|
| GOOG| Sam| 200|
| MSFT|Vanessa| 243|
| GOOG| Frank| 340|
| FB| Sarah| 350|
| MSFT| Tina| 600|
| FB| Carl| 870|
+-------+-------+-----+
Order By Descending based on the column:
----------------------------------------
scala> df.orderBy($"Sales".desc).show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| FB| Carl| 870|
| MSFT| Tina| 600|
| FB| Sarah| 350|
| GOOG| Frank| 340|
| MSFT|Vanessa| 243|
| GOOG| Sam| 200|
| MSFT| Amy| 124|
| GOOG|Charlie| 120|
+-------+-------+-----+
scala> df.orderBy($"Person".desc).show()
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
| MSFT|Vanessa| 243|
| MSFT| Tina| 600|
| FB| Sarah| 350|
| GOOG| Sam| 200|
| GOOG| Frank| 340|
| GOOG|Charlie| 120|
| FB| Carl| 870|
| MSFT| Amy| 124|
+-------+-------+-----+
NULL manipulation in Spark DataFrames:
--------------------------------------
ContainsNull.csv contents:
----------------------------
Id,Name,Sales
emp1,John,
emp2,,
emp3,,345.0
emp4,Cindy,456.0
Base code:
-------------
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("c://spark/mysources//containsnull.csv")
df.printSchema()
import spark.implicits._
scala> :load df.scala
Loading df.scala...
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6dd20ec9
df: org.apache.spark.sql.DataFrame = [Id: string, Name: string ... 1 more field]
root
|-- Id: string (nullable = true)
|-- Name: string (nullable = true)
|-- Sales: double (nullable = true)
import spark.implicits._
// ContainsNull.csv has some missing data
scala> df.show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
scala> df.na.(Tab)
drop fill replace
scala> df.show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
// drop -- eliminates any row that has a null value in any of its columns
// a row appears in the output only if none of its columns is null
scala> df.na.drop().show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+
//Minimum number of non-null values
// here we passed the argument 2, which means a row must have at least 2 non-null values to be kept
scala> df.na.drop(2).show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
scala> df.show();
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
// Sales is a numeric column - wherever Sales has a null value it is replaced with 0.0 (a numeric fill applies to the numeric columns that contain null)
scala> df.na.fill(0.0).show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| 0.0|
|emp2| null| 0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
scala> df.show()
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
// Name is a String column - when Name is null it is filled with "-NA-" (Not Applicable)
scala> df.na.fill("-NA-").show();
+----+-----+-----+
| Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| -NA-| null|
|emp3| -NA-|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+
//A string fill applies to any string column that contains null
scala> df.na.fill("-Missing Name-").show();
+----+--------------+-----+
| Id| Name|Sales|
+----+--------------+-----+
|emp1| John| null|
|emp2|-Missing Name-| null|
|emp3|-Missing Name-|345.0|
|emp4| Cindy|456.0|
+----+--------------+-----+
// In order to apply Fill operation for a particular column
// Here we specified which column name we want to fill
scala> df.na.fill("-No Name Found-",Array("Name")).show()
+----+---------------+-----+
| Id| Name|Sales|
+----+---------------+-----+
|emp1| John| null|
|emp2|-No Name Found-| null|
|emp3|-No Name Found-|345.0|
|emp4| Cindy|456.0|
+----+---------------+-----+
//Applying fill logic to more than one column at a time -- here na.fill is applied twice (first for the string column, then for the numeric column)
scala> df.na.fill("-No Name Found-",Array("Name")).na.fill(0.0,Array("Sales")).show()
+----+---------------+-----+
| Id| Name|Sales|
+----+---------------+-----+
|emp1| John| 0.0|
|emp2|-No Name Found-| 0.0|
|emp3|-No Name Found-|345.0|
|emp4| Cindy|456.0|
+----+---------------+-----+
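The same two fills can be written as one call by passing a Map from column name to replacement value; a sketch against the same df:
df.na.fill(Map("Name" -> "-No Name Found-", "Sales" -> 0.0)).show()
// gives the same table as chaining na.fill twice above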
scala> df.describe().show()
+-------+----+-----+-----------------+
|summary| Id| Name| Sales|
+-------+----+-----+-----------------+
| count| 4| 2| 2|
| mean|null| null| 400.5|
| stddev|null| null|78.48885271170677|
| min|emp1|Cindy| 345.0|
| max|emp4| John| 456.0|
+-------+----+-----+-----------------+
//Applied filler only for String column - "Name"
scala> val df2 = df.na.fill("-No Name Found-",Array("Name"))
df2: org.apache.spark.sql.DataFrame = [Id: string, Name: string ... 1 more field]
// Applied filler only for numeric column - "Sales"
scala> val df3 = df2.na.fill(0.00,Array("Sales"))
df3: org.apache.spark.sql.DataFrame = [Id: string, Name: string ... 1 more field]
scala> df3.show()
+----+---------------+-----+
| Id| Name|Sales|
+----+---------------+-----+
|emp1| John| 0.0|
|emp2|-No Name Found-| 0.0|
|emp3|-No Name Found-|345.0|
|emp4| Cindy|456.0|
+----+---------------+-----+
// back to CitiGroup2006_2008.txt
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.csv("C:\\Spark\\mysources\\CitiGroup2006_2008.txt")
df.printSchema()
scala> :load df.scala
Loading df.scala...
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6dd20ec9
df: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 4 more fields]
root
|-- _c0: string (nullable = true)
|-- _c1: string (nullable = true)
|-- _c2: string (nullable = true)
|-- _c3: string (nullable = true)
|-- _c4: string (nullable = true)
|-- _c5: string (nullable = true)
// Here all column names are _c0..._c5 and data types are just String...
scala> df.show(5)
+----------+-----+-----+-----+-----+-------+
| _c0| _c1| _c2| _c3| _c4| _c5|
+----------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
|2006-01-03|490.0|493.8|481.1|492.9|1537660|
|2006-01-04|488.6|491.0|483.5|483.8|1871020|
|2006-01-05|484.4|487.8|484.0|486.2|1143160|
|2006-01-06|488.8|489.0|482.0|486.2|1370250|
+----------+-----+-----+-----+-----+-------+
only showing top 5 rows
// In the above, we didn't mention inferSchema
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("inferSchema","true").option("header","true").csv("C:\\Spark\\mysources\\CitiGroup2006_2008.txt")
df.printSchema()
scala> :load df.scala
Loading df.scala...
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6dd20ec9
df: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 4 more fields]
root
|-- Date: timestamp (nullable = true)
|-- Open: double (nullable = true)
|-- High: double (nullable = true)
|-- Low: double (nullable = true)
|-- Close: double (nullable = true)
|-- Volume: integer (nullable = true)
//Here the column names and data types are correct because we used .option("header","true") and .option("inferSchema","true")
scala> df.show(5)
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740|
+-------------------+-----+-----+-----+-----+-------+
only showing top 5 rows
// Date Time Extractions
scala> df.select(year(df("Date"))).show(3)
+----------+
|year(Date)|
+----------+
| 2006|
| 2006|
| 2006|
+----------+
only showing top 3 rows
scala> df.select(month(df("Date"))).show(3)
+-----------+
|month(Date)|
+-----------+
| 1|
| 1|
| 1|
+-----------+
only showing top 3 rows
// Adding new column using withColumn - Here we extract Year from Date field and pass the values to Year column (dynamic column)
scala> val df2 = df.withColumn("Year",year(df("Date")))
df2: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 5 more fields]
scala> df2.show(2) // Note: the Year column is derived dynamically from the Date value
+-------------------+-----+-----+-----+-----+-------+----+
| Date| Open| High| Low|Close| Volume|Year|
+-------------------+-----+-----+-----+-----+-------+----+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|2006|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|2006|
+-------------------+-----+-----+-----+-----+-------+----+
only showing top 2 rows
// we apply grouping operation for each Year
scala> val dfavgs = df2.groupBy("Year").mean()
dfavgs: org.apache.spark.sql.DataFrame = [Year: int, avg(Open): double ... 5 more fields]
scala> dfavgs.show(5)
+----+------------------+-----------------+------------------+------------------+--------------------+---------+
|Year| avg(Open)| avg(High)| avg(Low)| avg(Close)| avg(Volume)|avg(Year)|
+----+------------------+-----------------+------------------+------------------+--------------------+---------+
|2007| 478.8549800796812|483.7444223107569|472.82892430278906| 477.8203984063745| 4107307.721115538| 2007.0|
|2006|489.29402390438236|492.4109163346613|486.12868525896414| 489.2697211155379| 1544542.6294820716| 2006.0|
|2008|191.67707509881424|197.3620553359684| 185.007628458498|190.48893280632404|1.3218876802371541E7| 2008.0|
+----+------------------+-----------------+------------------+------------------+--------------------+---------+
//Instead of displaying all the fields, we display only Year and the average closing price:
scala> dfavgs.select($"Year",$"avg(Close)").show()
+----+------------------+
|Year| avg(Close)|
+----+------------------+
|2007| 477.8203984063745|
|2006| 489.2697211155379|
|2008|190.48893280632404|
+----+------------------+
scala> df2.groupBy("Year").min().show()
+----+---------+---------+--------+----------+-----------+---------+
|Year|min(Open)|min(High)|min(Low)|min(Close)|min(Volume)|min(Year)|
+----+---------+---------+--------+----------+-----------+---------+
|2007| 291.4| 296.9| 288.0| 292.9| 1005203| 2007|
|2006| 450.7| 453.8| 448.1| 450.5| 632860| 2006|
|2008| 54.4| 55.3| 30.5| 37.7| 4007266| 2008|
+----+---------+---------+--------+----------+-----------+---------+
scala> df2.groupBy("Year").max().show()
+----+---------+---------+--------+----------+-----------+---------+
|Year|max(Open)|max(High)|max(Low)|max(Close)|max(Volume)|max(Year)|
+----+---------+---------+--------+----------+-----------+---------+
|2007| 556.6| 562.8| 548.5| 552.5| 23018947| 2007|
|2006| 566.0| 570.0| 555.5| 564.1| 5446320| 2006|
|2008| 297.3| 298.9| 289.1| 296.9| 102869289| 2008|
+----+---------+---------+--------+----------+-----------+---------+
scala> df2.groupBy("Year").avg().show()
+----+------------------+-----------------+------------------+------------------+--------------------+---------+
|Year| avg(Open)| avg(High)| avg(Low)| avg(Close)| avg(Volume)|avg(Year)|
+----+------------------+-----------------+------------------+------------------+--------------------+---------+
|2007| 478.8549800796812|483.7444223107569|472.82892430278906| 477.8203984063745| 4107307.721115538| 2007.0|
|2006|489.29402390438236|492.4109163346613|486.12868525896414| 489.2697211155379| 1544542.6294820716| 2006.0|
|2008|191.67707509881424|197.3620553359684| 185.007628458498|190.48893280632404|1.3218876802371541E7| 2008.0|
+----+------------------+-----------------+------------------+------------------+--------------------+---------+
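// The separate min/max/avg passes above can also be collapsed into a single agg() call.
// A sketch (not from the original session, using the same CitiGroup columns):
import org.apache.spark.sql.functions.{min, max, avg}
df2.groupBy("Year").agg(min("Close"), max("Close"), avg("Close")).orderBy("Year").show()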
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("inferSchema","true").option("header","true").csv("C:\\Spark\\mysources\\CitiGroup2006_2008.txt")
df.printSchema()
// Here there is no column named Year yet
scala> df.show(3)
+-------------------+-----+-----+-----+-----+-------+
| Date| Open| High| Low|Close| Volume|
+-------------------+-----+-----+-----+-----+-------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|
+-------------------+-----+-----+-----+-----+-------+
only showing top 3 rows
// Adding new column using withColumn - Here we extract Year from Date field and pass the values to Year column (dynamic column)
scala> val df2 = df.withColumn("Year",year(df("Date")))
df2: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 5 more fields]
// Here Year column is constructed
scala> df2.show(3)
+-------------------+-----+-----+-----+-----+-------+----+
| Date| Open| High| Low|Close| Volume|Year|
+-------------------+-----+-----+-----+-----+-------+----+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|2006|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|2006|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|2006|
+-------------------+-----+-----+-----+-----+-------+----+
only showing top 3 rows
// Based on Year (the newly constructed column derived from the Date field) we group and find the minimum of each field
scala> val dfmins = df2.groupBy("Year").min()
dfmins: org.apache.spark.sql.DataFrame = [Year: int, min(Open): double ... 5 more fields]
scala> dfmins.show(3)
+----+---------+---------+--------+----------+-----------+---------+
|Year|min(Open)|min(High)|min(Low)|min(Close)|min(Volume)|min(Year)|
+----+---------+---------+--------+----------+-----------+---------+
|2007| 291.4| 296.9| 288.0| 292.9| 1005203| 2007|
|2006| 450.7| 453.8| 448.1| 450.5| 632860| 2006|
|2008| 54.4| 55.3| 30.5| 37.7| 4007266| 2008|
+----+---------+---------+--------+----------+-----------+---------+
// Instead of displaying all the fields as above, here we display only min(Close) along with the Year column
scala> dfmins.select($"Year",$"min(Close)").show()
+----+----------+
|Year|min(Close)|
+----+----------+
|2007| 292.9|
|2006| 450.5|
|2008| 37.7|
+----+----------+
Tasks:
------
#1. Start a simple Spark Session
#2. Load the Netflix Stock CSV file, have Spark infer the data types
#3. What are the column names?
#4. What does the Schema look like?
#5. Print out the first 5 rows.
#6. Use describe() to learn about the DataFrame
#7. Create a new dataframe with a column called HVRatio that is the ratio of
the High Price versus volume of stock traded for a day.
#8. What day had the Peak High in Price?
#9. What is the mean of the Close column?
#10. What is the max and min of the Volume Column?
For Scala/Spark $ syntax:
#1. How many days was the Close lower than $600?
#2. What percentage of the time was the High greater than $500?
#3. What is the Pearson Correlation between High and Volume?
#4. What is the max High per Year?
#5. What is the average Close for each Calendar Month?
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()
val df = spark.read.option("header","true").option("inferSchema","true").csv("C:\\Spark\\mysources\\Netflix_2011_2016.csv")
df.columns
scala> :load df.scala
Loading df.scala...
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1640f20f
2018-10-27 18:43:29 WARN ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException
df: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 5 more fields]
res0: Array[String] = Array(Date, Open, High, Low, Close, Volume, Adj Close)
scala> for(c <- df.columns){
| println(c)
| }
Date
Open
High
Low
Close
Volume
Adj Close
scala> df.show(5)
+-------------------+----------+------------------+----------+-----------------+---------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+------------------+----------+-----------------+---------+------------------+
|2011-10-24 00:00:00|119.100002|120.28000300000001|115.100004| 118.839996|120460200| 16.977142|
|2011-10-25 00:00:00| 74.899999| 79.390001| 74.249997| 77.370002|315541800|11.052857000000001|
|2011-10-26 00:00:00| 78.73| 81.420001| 75.399997| 79.400002|148733900| 11.342857|
|2011-10-27 00:00:00| 82.179998| 82.71999699999999| 79.249998|80.86000200000001| 71190000|11.551428999999999|
|2011-10-28 00:00:00| 80.280002| 84.660002| 79.599999|84.14000300000001| 57769600| 12.02|
+-------------------+----------+------------------+----------+-----------------+---------+------------------+
scala> df.printSchema
root
|-- Date: timestamp (nullable = true)
|-- Open: double (nullable = true)
|-- High: double (nullable = true)
|-- Low: double (nullable = true)
|-- Close: double (nullable = true)
|-- Volume: integer (nullable = true)
|-- Adj Close: double (nullable = true)
scala> df.head(5)
res7: Array[org.apache.spark.sql.Row] = Array([2011-10-24 00:00:00.0,119.100002,120.28000300000001,115.100004,118.839996,120460200,16.977142], [2011-10-25 00:00:00.0,74.899999,79.390001,74.249997,77.370002,315541800,11.052857000000001], [2011-10-26 00:00:00.0,78.73,81.420001,75.399997,79.400002,148733900,11.342857], [2011-10-27 00:00:00.0,82.179998,82.71999699999999,79.249998,80.86000200000001,71190000,11.551428999999999], [2011-10-28 00:00:00.0,80.280002,84.660002,79.599999,84.14000300000001,57769600,12.02])
scala> df.head(5).foreach(println)
[2011-10-24 00:00:00.0,119.100002,120.28000300000001,115.100004,118.839996,120460200,16.977142]
[2011-10-25 00:00:00.0,74.899999,79.390001,74.249997,77.370002,315541800,11.052857000000001]
[2011-10-26 00:00:00.0,78.73,81.420001,75.399997,79.400002,148733900,11.342857]
[2011-10-27 00:00:00.0,82.179998,82.71999699999999,79.249998,80.86000200000001,71190000,11.551428999999999]
[2011-10-28 00:00:00.0,80.280002,84.660002,79.599999,84.14000300000001,57769600,12.02]
scala> for(row <- df.head(5)){
| println(row)
| }
[2011-10-24 00:00:00.0,119.100002,120.28000300000001,115.100004,118.839996,120460200,16.977142]
[2011-10-25 00:00:00.0,74.899999,79.390001,74.249997,77.370002,315541800,11.052857000000001]
[2011-10-26 00:00:00.0,78.73,81.420001,75.399997,79.400002,148733900,11.342857]
[2011-10-27 00:00:00.0,82.179998,82.71999699999999,79.249998,80.86000200000001,71190000,11.551428999999999]
[2011-10-28 00:00:00.0,80.280002,84.660002,79.599999,84.14000300000001,57769600,12.02]
scala> df.describe().show()
+-------+------------------+------------------+------------------+------------------+--------------------+------------------+
|summary| Open| High| Low| Close| Volume| Adj Close|
+-------+------------------+------------------+------------------+------------------+--------------------+------------------+
| count| 1259| 1259| 1259| 1259| 1259| 1259|
| mean|230.39351086656092|233.97320872915006|226.80127876251044| 230.522453845909|2.5634836060365368E7|55.610540036536875|
| stddev|164.37456353264244| 165.9705082667129| 162.6506358235739|164.40918905512854| 2.306312683388607E7|35.186669331525486|
| min| 53.990001| 55.480001| 52.81| 53.8| 3531300| 7.685714|
| max| 708.900017| 716.159996| 697.569984| 707.610001| 315541800| 130.929993|
+-------+------------------+------------------+------------------+------------------+--------------------+------------------+
scala> val df2 = df.withColumn("HV Ratio",df("High") / df("Volume"))
df2: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 6 more fields]
scala> df2.columns
res12: Array[String] = Array(Date, Open, High, Low, Close, Volume, Adj Close, HV Ratio)
scala> df2.show(5)
+-------------------+----------+------------------+----------+-----------------+---------+------------------+--------------------+
| Date| Open| High| Low| Close| Volume| Adj Close| HV Ratio|
+-------------------+----------+------------------+----------+-----------------+---------+------------------+--------------------+
|2011-10-24 00:00:00|119.100002|120.28000300000001|115.100004| 118.839996|120460200| 16.977142|9.985040951285156E-7|
|2011-10-25 00:00:00| 74.899999| 79.390001| 74.249997| 77.370002|315541800|11.052857000000001|2.515989989281927E-7|
|2011-10-26 00:00:00| 78.73| 81.420001| 75.399997| 79.400002|148733900| 11.342857|5.474206014903126E-7|
|2011-10-27 00:00:00| 82.179998| 82.71999699999999| 79.249998|80.86000200000001| 71190000|11.551428999999999|1.161960907430818...|
|2011-10-28 00:00:00| 80.280002| 84.660002| 79.599999|84.14000300000001| 57769600| 12.02|1.465476686700271...|
+-------------------+----------+------------------+----------+-----------------+---------+------------------+--------------------+
only showing top 5 rows
scala> df.head(1)
res14: Array[org.apache.spark.sql.Row] = Array([2011-10-24 00:00:00.0,119.100002,120.28000300000001,115.100004,118.839996,120460200,16.977142])
scala> df.orderBy($"High".desc).show(1)
+-------------------+-----------------+----------+----------+----------+--------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+-----------------+----------+----------+----------+--------+------------------+
|2015-07-13 00:00:00|686.6900019999999|716.159996|686.550026|707.610001|33205200|101.08714300000001|
+-------------------+-----------------+----------+----------+----------+--------+------------------+
only showing top 1 row
scala> df.select(mean("Close")).show()
+----------------+
| avg(Close)|
+----------------+
|230.522453845909|
+----------------+
scala> df.select(max("Volume")).show()
+-----------+
|max(Volume)|
+-----------+
| 315541800|
+-----------+
scala> df.select(min("Volume")).show()
+-----------+
|min(Volume)|
+-----------+
| 3531300|
+-----------+
import spark.implicits._
To use the Scala/Spark $ syntax (or SQL-style strings) in filter commands, we have to include the line above.
It gives us access to the $-column syntax for filtering.
scala> df.filter($"Close" < 600 ).count()
res19: Long = 1218
scala> df.filter("Close < 600").count()
res20: Long = 1218
scala> (df.filter($"High" > 500).count() / df.count() ) * 100
res21: Long = 0
// count() returns a Long, so the division above is integer division and truncates to 0; multiplying by 1.0 first forces floating-point division:
scala> (df.filter($"High" > 500).count() * 1.0 / df.count() ) * 100
res22: Double = 4.924543288324067
scala> df.select(corr("High","Volume")).show()
+--------------------+
| corr(High, Volume)|
+--------------------+
|-0.20960233287942157|
+--------------------+
scala> val yeardf = df.withColumn("Year",year(df("Date")))
yeardf: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 6 more fields]
scala> yeardf.show(3)
+-------------------+----------+------------------+----------+----------+---------+------------------+----+
| Date| Open| High| Low| Close| Volume| Adj Close|Year|
+-------------------+----------+------------------+----------+----------+---------+------------------+----+
|2011-10-24 00:00:00|119.100002|120.28000300000001|115.100004|118.839996|120460200| 16.977142|2011|
|2011-10-25 00:00:00| 74.899999| 79.390001| 74.249997| 77.370002|315541800|11.052857000000001|2011|
|2011-10-26 00:00:00| 78.73| 81.420001| 75.399997| 79.400002|148733900| 11.342857|2011|
+-------------------+----------+------------------+----------+----------+---------+------------------+----+
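// Note: the original transcript skips the step that builds yearmaxs; presumably it was something like:
val yearmaxs = yeardf.select($"Year", $"High").groupBy("Year").max()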
scala> yearmaxs.select($"Year",$"max(High)").show()
+----+------------------+
|Year| max(High)|
+----+------------------+
|2015| 716.159996|
|2013| 389.159988|
|2014| 489.290024|
|2012| 133.429996|
|2016|129.28999299999998|
|2011|120.28000300000001|
+----+------------------+
scala> val result = yearmaxs.select($"Year",$"max(High)")
result: org.apache.spark.sql.DataFrame = [Year: int, max(High): double]
scala> result.orderBy("Year").show()
+----+------------------+
|Year| max(High)|
+----+------------------+
|2011|120.28000300000001|
|2012| 133.429996|
|2013| 389.159988|
|2014| 489.290024|
|2015| 716.159996|
|2016|129.28999299999998|
+----+------------------+
scala> val monthdf = df.withColumn("Month",month(df("Date")))
monthdf: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 6 more fields]
scala> val monthavgs = monthdf.select($"Month",$"Close").groupBy("Month").mean()
monthavgs: org.apache.spark.sql.DataFrame = [Month: int, avg(Month): double ... 1 more field]
scala> monthavgs.select($"Month",$"avg(Close)").orderBy("Month").show()
+-----+------------------+
|Month| avg(Close)|
+-----+------------------+
| 1|212.22613874257422|
| 2| 254.1954634020619|
| 3| 249.5825228971963|
| 4|246.97514271428562|
| 5|264.37037614150944|
| 6| 295.1597153490566|
| 7|243.64747528037387|
| 8|195.25599892727263|
| 9|206.09598121568627|
| 10|205.93297300900903|
| 11| 194.3172275445545|
| 12| 199.3700942358491|
+-----+------------------+
----------------------------------------------------
Python and R are the most popular data science languages.
Scala is a language designed for scalability
and for working with large datasets.
Scala runs on the JVM (Java Virtual Machine), so it can run anywhere Java runs.
It supports both Functional and Object Oriented styles.
Functional:
compute by evaluating functions, minimizing the need to maintain state
Object Oriented:
programs are structured around data structures and methods rather than actions
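A minimal sketch of the two styles (illustrative only, not from the original notes):
// Functional style: compute a result by evaluating functions, without mutable state
val total = List(1, 2, 3, 4).foldLeft(0)(_ + _)      // 10
// Object-oriented style: data and the methods that act on it are kept together
class Counter {
  private var count = 0                              // state lives inside the object
  def increment(): Unit = { count += 1 }
  def value: Int = count
}
val c = new Counter
c.increment()
c.value                                              // 1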
Work with SQL:
JDBC support for querying databases (see the sketch below)
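A hedged sketch of reading a database table over JDBC into a DataFrame (the URL, table name, and credentials below are placeholders, and the matching JDBC driver jar must be on the classpath):
val jdbcDF = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql://dbhost:5432/mydb")   // placeholder connection URL
  .option("dbtable", "public.customers")                 // placeholder table
  .option("user", "dbuser")
  .option("password", "dbpass")
  .load()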
Parallel Processing:
use multiple cores
support for parallel collections
especially important for scaling applications to large data volumes
Apache Spark:
Big data platform written in Scala
Supports Scala, Java, Python, R
Distributed cluster processing
Data Types
Byte, Short, Int, Long,Float, Double
Char
How to start Scala?
start terminal
spark-shell
scala>
scala> val a_int : Int = 3
a_int: Int = 3
scala> val a = 3
a: Int = 3
scala> val b_long : Long = 234234234234234234l
b_long: Long = 234234234234234234
scala> val c_char = 'd'
c_char: Char = d
scala> var d_float = 1.234f
d_float: Float = 1.234
scala> val e_double = 23.234234234234234234234234234
e_double: Double = 23.234234234234233
Collections:
Sequences
Vectors, Streams, Lists, Queues, Strings, Stacks
Sets
HashSet, SortedSet, TreeSet, BitSet, ListSet
Maps
HashMaps, SortedMaps, TreeMaps, ListMaps
Mutable (Changeable)
Immutable (Non Changeable)
Scala Arrays:
Indexed Collection of values
Mutable Collection type (changeable)
Arrays are zero-based (indexed from 0)
scala> val temps = Array(50,51,56,53,40)
temps: Array[Int] = Array(50, 51, 56, 53, 40)
scala> temps(1)
res0: Int = 51
scala> temps(0)
res1: Int = 50
scala> temps.length
res2: Int = 5
scala> temps.size
res3: Int = 5
scala> temps(0) = 5000
scala> temps
res5: Array[Int] = Array(5000, 51, 56, 53, 40)
scala> val temps2 : Array[Int] = new Array[Int](10)
temps2: Array[Int] = Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Multi Dimensional Arrays:
-------------------------
scala> val temp3 = Array.ofDim[Int](10,10)
temp3: Array[Array[Int]] = Array(Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
scala> val temp3 = Array.ofDim[Int](2,30)
temp3: Array[Array[Int]] = Array(Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
scala> val temp3 = Array.ofDim[Int](2,3)
temp3: Array[Array[Int]] = Array(Array(0, 0, 0), Array(0, 0, 0))
Import Array Package:
----------------------
scala> import Array._
import Array._
Concatenating 2 arrays:
-----------------------
scala> temps
res10: Array[Int] = Array(5000, 51, 56, 53, 40)
scala> temps2
res11: Array[Int] = Array(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
scala> concat(temps,temps2)
res13: Array[Int] = Array(5000, 51, 56, 53, 40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
scala> temps.max
res14: Int = 5000
scala> temps.min
res15: Int = 40
scala> temps.take(5).foreach(println)
5000
51
56
53
40
scala> val vec1 :Vector[Int]=Vector(1,2,3,4,5,6,7,8,9,10)
vec1: Vector[Int] = Vector(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
scala> vec1(2)
res17: Int = 3
scala> val myRange = 1 to 10
myRange: scala.collection.immutable.Range.Inclusive = Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
scala> val myRange2 :Range = new Range(1,101,7)
myRange2: Range = Range(1, 8, 15, 22, 29, 36, 43, 50, 57, 64, 71, 78, 85, 92, 99)
//Range with Step value
scala> val myRange2 :Range = new Range(1,101,23)
myRange2: Range = Range(1, 24, 47, 70, 93)
Maps:
--------
scala> val capitals = Map("Argentina" -> "Buenos Aires", "Canada"->"Ottava","Egypt"->"Cairo","Liberia"->"Monrovia","Netherlands"->"Amsterdam","United States"->"Washinton D.C")
capitals: scala.collection.immutable.Map[String,String] = Map(United States -> Washinton D.C, Argentina -> Buenos Aires, Egypt -> Cairo, Canada -> Ottava, Liberia -> Monrovia, Netherlands -> Amsterdam)
scala> capitals
res18: scala.collection.immutable.Map[String,String] = Map(United States -> Washinton D.C, Argentina -> Buenos Aires, Egypt -> Cairo, Canada -> Ottava, Liberia -> Monrovia, Netherlands -> Amsterdam)
scala> capitals.keys
res19: Iterable[String] = Set(United States, Argentina, Egypt, Canada, Liberia, Netherlands)
scala> capitals.values
res20: Iterable[String] = MapLike(Washinton D.C, Buenos Aires, Cairo, Ottava, Monrovia, Amsterdam)
scala> capitals.take(2).foreach(println)
(United States,Washinton D.C)
(Argentina,Buenos Aires)
scala> capitals.keys.foreach(println)
United States
Argentina
Egypt
Canada
Liberia
Netherlands
scala> capitals.values.foreach(println)
Washinton D.C
Buenos Aires
Cairo
Ottava
Monrovia
Amsterdam
Query against existing Maps:
-----------------------------------
MapCollection("Key"):
scala> capitals("Argentina")
res24: String = Buenos Aires
scala> capitals get "Argentina"
res25: Option[String] = Some(Buenos Aires)
scala> capitals get "Egypt"
res26: Option[String] = Some(Cairo)
scala> capitals.get("Egypt")
res27: Option[String] = Some(Cairo)
scala> capitals get ("Mexico")
res28: Option[String] = None
scala> capitals("Canada")
res29: String = Ottava
scala> capitals.contains("Egypt")
res30: Boolean = true
scala> capitals contains "Egypt"
res31: Boolean = true
scala> capitals.getOrElse("China","No capitals found")
res32: String = No capitals found
scala> capitals getOrElse("China","No capitals found")
res33: String = No capitals found
scala> capitals.foreach(println)
(United States,Washinton D.C)
(Argentina,Buenos Aires)
(Egypt,Cairo)
(Canada,Ottava)
(Liberia,Monrovia)
(Netherlands,Amsterdam)
Add new Map (Key,value pair)
scala> capitals + ("India"->"Delhi")
res39: scala.collection.immutable.Map[String,String] = Map(United States -> Washinton D.C, Argentina -> Buenos Aires, Egypt -> Cairo, Canada -> Ottava, India -> Delhi, Liberia -> Monrovia, Netherlands -> Amsterdam)
Remove existing Key,value pair:
scala> capitals - "Liberia"
res40: scala.collection.immutable.Map[String,String] = Map(United States -> Washinton D.C, Argentina -> Buenos Aires, Egypt -> Cairo, Canada -> Ottava, Netherlands -> Amsterdam)
Scala Expressions are computable statements
Arithmetic:
scala> 2+2
res41: Int = 4
scala> 100-80
res42: Int = 20
scala> 4*6
res43: Int = 24
scala> 33/4
res44: Int = 8
scala> 33 % 8
res45: Int = 1
Relational:
scala> 3 > 4
res46: Boolean = false
scala> 3 < 4
res47: Boolean = true
scala> 5 <= 10
res48: Boolean = true
Logical:
scala> (3 > 4) && (5 <= 10)
res49: Boolean = false
scala> (3 > 4) || (5 <= 10)
res50: Boolean = true
scala> !(3 > 4)
res51: Boolean = true
Shorthand:
scala> var a = 10
a: Int = 10
scala> var b = 20
b: Int = 20
scala> var c = 30
c: Int = 30
scala> c += a //(c = c + a)
scala> c
res53: Int = 40
scala> c *= a // (c = c*a)
scala> c
res55: Int = 400
Multiple expressions in a single block:
scala> print {
| val a = 10
| val b = 20
| val c = a+b
| c
| }
30
Scala functions:
----------------
scala> def myFunction(a:Int, b:Int) : Int = {
| val c = a*b
| return c
| }
myFunction: (a: Int, b: Int)Int
scala> myFunction(10,20)
res57: Int = 200
scala> myFunction(3,4)
res58: Int = 12
// If a Scala function doesn't return a value, it's a procedure
scala> def myProcedure (inStr : String) : Unit = { // Unit means void
| println(inStr)
| }
myProcedure: (inStr: String)Unit
scala> myProcedure("This is a log message")
This is a log message
Class example:
----------------
scala> class location(var latitude:Int, var lat_direction:Char, var longitude:Int, var long_direction:Char, var altitude:Int)
defined class location
scala> val loc1 = new location(45,'N',120,'W',300)
loc1: location = $iwC$$iwC$location@502604a8
scala> loc1.latitude
res64: Int = 45
scala> loc1.longitude
res65: Int = 120
scala> loc1.altitude
res66: Int = 300
Private example:
scala> class myPublicPrivate(val x:Int = 0, val y:Int = 0, private val z:Int = 0)
defined class myPublicPrivate
scala> val myPP = new myPublicPrivate
myPP: myPublicPrivate = $iwC$$iwC$myPublicPrivate@279d59fc
scala> myPP.
asInstanceOf isInstanceOf toString x y
// Note that the private member z is not listed; it is accessible only within the class definition
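To confirm, accessing the private member from outside the class would not compile (a small sketch):
myPP.x        // OK: x is a public val
// myPP.z     // does not compile: z is private to myPublicPrivate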
// function within the class
class Point2D(coord1:Int, coord2:Int){
var a : Int = coord1
var b : Int = coord2
def move(deltaA: Int, deltaB : Int){
a = a + deltaA
b = b + deltaB
}
}
scala> val p1 = new Point2D(10,20)
p1: Point2D = $iwC$$iwC$Point2D@490bfaea
scala> p1.a
res74: Int = 10
scala> p1.b
res75: Int = 20
// calling function here
scala> p1.move(3,8)
scala> p1.a
res77: Int = 13
scala> p1.b
res78: Int = 28
Advantages of parallel processing:
Multiple cores
2 or 4 cores on laptops
many more cores on servers
A for loop that processes elements one at a time is very slow when we have tens of thousands of elements:
1. ---------------------------------------------------
2. --------------------------
3. -------------
4. -------
5. ----
6. --
7. -
The more work we can run in parallel, the faster it finishes.
1 x 1000 = 125 x 8
With an 8-core processor, 1000 units of work are shared across the 8 cores, so throughput improves roughly 8-fold (see the timing sketch below).
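A rough way to see this is to time the same map over a sequential and a parallel collection (a sketch; the actual speed-up depends on the number of cores and the cost of each operation):
// Sketch: compare a sequential and a parallel map over the same data
def timeMs[T](block: => T): Long = {
  val start = System.nanoTime()
  block
  (System.nanoTime() - start) / 1000000      // elapsed milliseconds
}
val nums  = (1 to 1000000).toArray
val pnums = nums.par
val seqMs = timeMs { nums.map(x => math.sqrt(x.toDouble)) }
val parMs = timeMs { pnums.map(x => math.sqrt(x.toDouble)) }
println(s"sequential: $seqMs ms, parallel: $parMs ms")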
Sequential Collections:
Arrays, Vectors, HashMap, HashSet
Parallel Collections:
ParArray, ParVector, ParHashMap
2 ways of creating a parallel collection:
create a parallel collection directly
convert an existing sequential collection into a parallel collection
scala> val rng100 = 1 to 100
rng100: scala.collection.immutable.Range.Inclusive = Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)
// converting a sequential range into a parallel collection using .par
scala> val prng100 = rng100.par
prng100: scala.collection.parallel.immutable.ParRange = ParRange(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)
scala> import scala.collection.parallel.immutable.ParVector
import scala.collection.parallel.immutable.ParVector
scala> val pvec200 = ParVector.range(0,200)
pvec200: scala.collection.parallel.immutable.ParVector[Int] = ParVector(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, ...
Functions over parallel collection:
-------------------------------------
scala> val v = (1 to 100).toArray
v: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)
scala> val pv = v.par
pv: scala.collection.parallel.mutable.ParArray[Int] = ParArray(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)
To apply an operation to every member of a collection, use the map function
scala> v.map (x => x * 2)
res79: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200)
scala> pv.map (x => x * 2)
res80: scala.collection.parallel.mutable.ParArray[Int] = ParArray(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200)
//short hand notation
scala> v.map (_ * 2)
res81: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200)
scala> pv.map (_ * 2)
res82: scala.collection.parallel.mutable.ParArray[Int] = ParArray(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200)
//create a function to find the square of a given number:
scala> def squareIt(x:Int) : Int = { return x * x }
squareIt: (x: Int)Int
scala> squareIt(4)
res86: Int = 16
// pass each element of a collection to a function:
pv.map (squareIt(_))
res87: scala.collection.parallel.mutable.ParArray[Int] = ParArray(1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916, 3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969, 4096, 4225, 4356, 4489, 4624, 4761, 4900, 5041, 5184, 5329, 5476, 5625, 5776, 5929, 6084, 6241, 6400, 6561, 6724, 6889, 7056, 7225, 7396, 7569, 7744, 7921, 8100, 8281, 8464, 8649, 8836, 9025, 9216, 9409, 9604, 9801, 10000)
//Applying a filter against an existing collection
scala> val v = (1 to 10000).toArray
v: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177...
scala> val pv = v.par
pv: scala.collection.parallel.mutable.ParArray[Int] = ParArray(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 16...
scala> v.length
res88: Int = 10000
scala> pv.length
res89: Int = 10000
scala> val pvf = pv.filter ( _ < 30)
pvf: scala.collection.parallel.mutable.ParArray[Int] = ParArray(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)
scala> val pvf2 = pv.filterNot(_ >30)
pvf2: scala.collection.parallel.mutable.ParArray[Int] = ParArray(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)
// divisible by 3: the modulus result is zero (no remainder)
scala> def div3 (x : Int) : Boolean = { val y : Int = (x % 3); return (y == 0)}
div3: (x: Int)Boolean
scala> div3(30)
res90: Boolean = true
scala> div3(10)
res91: Boolean = false
// filter: keep only elements divisible by 3 (no remainder)
scala> pv.filter(div3(_))
res92: scala.collection.parallel.mutable.ParArray[Int] = ParArray(3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111, 114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156, 159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201, 204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246, 249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291, 294, 297, 300, 303, 306, 309, 312, 315, 318, 321, 324, 327, 330, 333, 336, 339, 342, 345, 348, 351, 354, 357, 360, 363, 366, 369, 372, 375, 378, 381, 384, 387, 390, 393, 396, 399, 402, 405, 408, 411, 414, 417, 420, 423, 426, 429, 432, 435, 438, 441, 444, 447, 450, 453, 456, 459, 46...
Large Collections:
Collections with at least thousands of elements are good candidates for parallelism
For some types of collections, converting between sequential and parallel collections
requires copying the contents of the collection (see the sketch below)
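A small illustration of the conversion cost (hedged: the exact behaviour depends on the Scala version, but an Array can typically be wrapped cheaply while a List has to be copied):
val arr = (1 to 100000).toArray
val parArr = arr.par        // cheap: a ParArray can reuse the underlying array
val lst = (1 to 100000).toList
val parLst = lst.par        // more expensive: the List's elements are copied out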
Spark:
-------
Scala is both a functional and an object-oriented programming language
Client --> Spark Cluster Manager
-> Spark Worker
-> Spark Worker
-> Spark Worker
-> Spark Worker
-> Spark Worker
If data can't be processed within a reasonable amount of time,
you can consider Spark
Fast libraries for analytics processing
Spark is a distributed processing framework written in Scala
It's faster than Hadoop
Fault tolerant
Scalable - easy to add nodes
Packages for distributed processing for data science
using SQL and Scala in Spark
RDDs:
------
An RDD is a data structure:
Immutable (non-changeable) distributed collection
Organized into logical partitions
Fault-tolerant collection
May keep data in memory or persisted
How RDDs are like parallel collections:
Groups of data of the same type or structure
Data processed in parallel
Faster than working with sequential operations
How RDDs are unlike parallel collections:
Parallel collections are distributed across multiple cores of a single server,
but RDDs are distributed across different nodes
RDDs are partitioned by a hash function
RDD data can be easily persisted to permanent storage (see the sketch after the partition example below)
RDD of pairs and four partitions
Partition #1
(I,234)
(Love,23)
Partition #2
(I,3)
(Love,2333)
Partition #3
(I,5)
(Love,2)
Partition #4
(I,1)
(Love,55)
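A hedged sketch of building a pair RDD with four hash partitions and persisting it (names and values are illustrative):
import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel
val pairs  = sc.parallelize(Seq(("I", 234), ("Love", 23), ("I", 3), ("Love", 2333)))
val parted = pairs.partitionBy(new HashPartitioner(4))   // hash-partition into 4 partitions
parted.persist(StorageLevel.MEMORY_AND_DISK)             // keep data in memory, spill to disk
parted.getNumPartitions                                  // 4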
spark-shell : To start Spark
------------------------------
scala> import scala.util.Random
import scala.util.Random
scala> val bigRng = scala.util.Random.shuffle(1 to 100000)
bigRng: scala.collection.immutable.IndexedSeq[Int] = Vector(75517, 19117, 16670, 5788, 98464, 46953, 68961, 70829, 47347, 3494, 21931, 76401, 53320, 81793, 81806, 92847, 92331, 37434, 25262, 57372, 31731, 89471, 3585, 4939, 54946, 47154, 90634, 83196, 80534, 43886, 69999, 44490, 78480, 6417, 47317, 5963, 53122, 34201, 49808, 86023, 58648, 71465, 17459, 95334, 54594, 63940, 33259, 50087, 65084, 35736, 48027, 52327, 23394, 289, 12089, 76269, 73685, 54280, 12056, 90367, 16043, 25023, 15909, 67828, 58435, 49570, 30727, 94080, 50807, 62392, 30591, 76549, 79434, 17608, 63028, 89444, 56614, 97125, 9234, 48797, 30022, 59483, 67901, 88105, 56325, 50693, 33454, 74157, 5644, 85156, 47871, 10115, 60734, 44745, 90998, 20702, 84057, 70348, 96297, 20607, 90028, 52039, 66872, 23128, 22128, 23674, 90090...
scala> val bigPRng = sc.parallelize(bigRng)
bigPRng: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:36
scala> bigPRng.mean
18/10/22 00:48:53 WARN scheduler.TaskSetManager: Stage 0 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res93: Double = 50000.49999999981
scala> bigPRng.min
18/10/22 00:49:09 WARN scheduler.TaskSetManager: Stage 1 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res94: Int = 1
scala> bigPRng.max
18/10/22 00:49:16 WARN scheduler.TaskSetManager: Stage 2 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res95: Int = 100000
Mapping functions over RDDs:
-----------------------------
scala> import scala.util.Random
import scala.util.Random
scala> val bigRng = scala.util.Random.shuffle(1 to 100000)
bigRng: scala.collection.immutable.IndexedSeq[Int] = Vector(94900, 82840, 23088, 79324, 45716, 52511, 28901, 99502, 57102, 69321, 118, 8435, 67827, 16161, 99828, 5706, 41010, 29631, 97397, 54582, 53211, 8759, 60559, 87901, 89142, 44544, 27200, 59372, 54065, 82242, 90265, 62493, 9051, 36857, 58750, 71653, 21882, 56720, 88417, 67842, 76998, 99431, 36280, 68944, 73516, 17742, 20679, 11902, 33351, 41920, 17634, 97529, 81860, 51711, 51526, 2624, 30716, 92984, 57937, 37631, 47749, 79319, 92863, 12749, 67647, 89273, 27326, 21747, 27900, 69599, 2219, 89430, 69601, 76812, 87770, 2355, 55860, 24989, 20583, 90716, 13810, 84155, 94910, 82770, 72862, 5930, 42856, 2027, 12395, 5098, 93477, 14882, 54175, 41908, 23396, 53610, 76395, 30669, 90002, 93760, 37883, 77257, 50579, 48130, 61385, 71958, 8801, 8...
scala> val bigPRng = sc.parallelize(bigRng)
bigPRng: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[3] at parallelize at <console>:38
scala> bigPRng.take(25)
18/10/22 00:52:20 WARN scheduler.TaskSetManager: Stage 3 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res97: Array[Int] = Array(94900, 82840, 23088, 79324, 45716, 52511, 28901, 99502, 57102, 69321, 118, 8435, 67827, 16161, 99828, 5706, 41010, 29631, 97397, 54582, 53211, 8759, 60559, 87901, 89142)
scala> bigPRng.take(25).foreach(println)
18/10/22 00:52:29 WARN scheduler.TaskSetManager: Stage 4 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
94900
82840
23088
79324
45716
52511
28901
99502
57102
69321
118
8435
67827
16161
99828
5706
41010
29631
97397
54582
53211
8759
60559
87901
89142
Apply map to existing RDD:
-------------------------
To apply a function to all elements of a collection
// Map function goes here
scala> val bigPRng2 = bigPRng.map (_ * 2)
bigPRng2: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[5] at map at <console>:40
scala> bigPRng2
res99: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[5] at map at <console>:40
scala> bigPRng2.take(25)
18/10/22 00:54:00 WARN scheduler.TaskSetManager: Stage 5 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res100: Array[Int] = Array(189800, 165680, 46176, 158648, 91432, 105022, 57802, 199004, 114204, 138642, 236, 16870, 135654, 32322, 199656, 11412, 82020, 59262, 194794, 109164, 106422, 17518, 121118, 175802, 178284)
//divisible by 3 function
scala> def div(x:Int) : Boolean = {val y:Int= (x%3); return (y==0)}
div: (x: Int)Boolean
scala> div(3)
res101: Boolean = true
scala> div(4)
res102: Boolean = false
// applying the function for each element of an RDD
scala> val bigBool = bigPRng2.map(div(_))
bigBool: org.apache.spark.rdd.RDD[Boolean] = MapPartitionsRDD[6] at map at <console>:46
scala> bigBool.take(25)
18/10/22 00:55:47 WARN scheduler.TaskSetManager: Stage 6 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res103: Array[Boolean] = Array(false, false, true, false, false, false, false, false, true, true, false, false, true, true, true, true, true, true, false, true, true, false, false, false, true)
//copy the janani.txt file into an HDFS folder
[cloudera@quickstart ~]$ hdfs dfs -copyFromLocal janani.txt Sparks
/user/cloudera/Sparks/janani.txt
//load text file into RDD
scala> val janani = sc.textFile("/user/cloudera/Sparks/janani.txt")
janani: org.apache.spark.rdd.RDD[String] = /user/cloudera/Sparks/janani.txt MapPartitionsRDD[14] at textFile at <console>:36
// display the first 5 lines
scala> janani.take(5)
res106: Array[String] = Array(Hive Janani, --------------, "", " Transactional and analytical processing", " Hive Vs RDBS")
// filter: keep lines that contain the word 'select'
scala> val lineswithSelect = janani.filter(line => line.contains("select"))
lineswithSelect: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[18] at filter at <console>:38
scala> lineswithSelect.take(10).foreach(println)
select * from customers;
select * from customers where address='IN';
select * from customers where address like ('IN');
select name,address from customers where address like ('IN');
select distinct address from customers;
select name,address from customers order by address desc;
select count(*) from customers;
select address,count(*) from customers group by address;
If it is a simple SELECT without a WHERE clause or aggregation, Hive does not launch a MapReduce job.
select customers.id,name,product_id,quantity,amount from customers join orders where customers.id = orders.customer_id;
Statistics :
----------------
scala> import scala.util.Random
import scala.util.Random
scala> import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.stat.Statistics
scala> val bigRng = scala.util.Random.shuffle(1 to 100000)
bigRng: scala.collection.immutable.IndexedSeq[Int] = Vector(91095, 9638, 22561, 85501, 8388, 20441, 61060, 51392, 61561, 16225, 18943, 28188, 35297, 5631, 68146, 81868, 88923, 13958, 48393, 37528, 92786, 86026, 98937, 3848, 74809, 34408, 87600, 45401, 38884, 11989, 98399, 50943, 85832, 33283, 31028, 107, 1353, 81230, 8568, 86177, 86395, 26021, 10646, 91551, 25390, 50433, 30862, 54020, 12103, 41863, 89756, 64726, 73827, 51875, 5899, 93594, 58242, 3345, 22987, 38932, 22453, 21303, 56061, 21774, 40002, 22918, 97989, 20176, 2421, 54921, 79092, 66657, 96301, 48011, 10370, 91323, 3950, 61994, 39769, 10401, 16956, 99915, 28158, 38485, 36461, 82894, 37913, 71965, 7528, 14271, 12690, 3683, 49168, 13790, 49230, 9598, 57461, 94808, 98916, 39452, 99688, 93745, 95728, 13935, 76362, 13121, 48872, 819...
scala> val bigPRNg = sc.parallelize(bigRng)
bigPRNg: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[19] at parallelize at <console>:41
scala> val bigPRng2 = bigPRng.map(_ * 2)
bigPRng2: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[20] at map at <console>:43
Take a sample:
A sample is a subset randomly selected from a collection (RDD)
scala> val x = bigPRng2.takeSample(true,1000)
18/10/22 23:20:11 WARN scheduler.TaskSetManager: Stage 9 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
18/10/22 23:20:11 WARN scheduler.TaskSetManager: Stage 10 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
x: Array[Int] = Array(24362, 106660, 11866, 185538, 162800, 34796, 47468, 17746, 3902, 178864, 135996, 92678, 126612, 157908, 157384, 149298, 54496, 21954, 153802, 109340, 125262, 80640, 89536, 129846, 138948, 156946, 81444, 198828, 29086, 123216, 86080, 151846, 16386, 157232, 20104, 146044, 71476, 113090, 148258, 123936, 54004, 93768, 20962, 181584, 53802, 122976, 56908, 4192, 166166, 140072, 88282, 93544, 192026, 135214, 141114, 194096, 176458, 78694, 191316, 149050, 112416, 101952, 113896, 181580, 90830, 108628, 65326, 89968, 59864, 158514, 103282, 85496, 59000, 9818, 86530, 183392, 57588, 149792, 88068, 147584, 100476, 146332, 26370, 12428, 60492, 159344, 75170, 184996, 111140, 132238, 26846, 98002, 31102, 171248, 11978, 106742, 48288, 60194, 148240, 189380, 199310, 163382, 167712, ...
scala> val x = bigPRng2.takeSample(true,1000)
18/10/22 23:20:19 WARN scheduler.TaskSetManager: Stage 11 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
18/10/22 23:20:19 WARN scheduler.TaskSetManager: Stage 12 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
x: Array[Int] = Array(20192, 58242, 129452, 93558, 67982, 197420, 78984, 12180, 155726, 30872, 143612, 148310, 198260, 29008, 179216, 108096, 129320, 130654, 185524, 137098, 103388, 96788, 96678, 105436, 165528, 26214, 147704, 188060, 91658, 125604, 142896, 42794, 188050, 103120, 67642, 75064, 46154, 183936, 39240, 86554, 188520, 58974, 135966, 73972, 17408, 100440, 143296, 137652, 23436, 76174, 199628, 171200, 167538, 21658, 91320, 114100, 76128, 171194, 18824, 73056, 60082, 69450, 129340, 135164, 97120, 164026, 36182, 166342, 113580, 38292, 55902, 66182, 138902, 195040, 46708, 16080, 142520, 95884, 40954, 95758, 39958, 165690, 62524, 187458, 142712, 51498, 69102, 136192, 4082, 14408, 78294, 128258, 70734, 170414, 52542, 50702, 181996, 68104, 10370, 86974, 185428, 172382, 79684, 55760,...
// here we pass a seed value (1234) so that we get the same set of values every time
// if you want a reproducible result, you can supply a seed value
scala> val x = bigPRng2.takeSample(true,1000,1234)
18/10/22 23:21:42 WARN scheduler.TaskSetManager: Stage 13 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
18/10/22 23:21:42 WARN scheduler.TaskSetManager: Stage 14 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
x: Array[Int] = Array(58180, 115670, 20038, 197136, 74802, 168364, 10386, 113546, 133782, 33844, 104378, 63586, 95716, 166390, 26314, 118614, 72380, 148474, 125294, 67858, 168820, 181684, 121832, 64804, 164854, 156202, 49680, 24198, 126430, 31890, 178666, 4868, 19166, 6644, 165826, 61906, 102314, 155714, 46688, 132820, 174446, 164376, 69740, 112170, 100186, 2824, 124978, 61304, 188802, 175104, 77978, 192202, 184244, 91750, 173898, 43122, 44174, 192002, 12776, 198562, 149208, 198382, 70822, 74210, 112034, 126006, 136660, 161938, 192292, 57972, 196212, 119050, 98936, 75076, 184604, 96474, 144724, 15690, 4464, 22368, 142552, 189946, 5654, 100718, 128298, 20642, 55708, 133306, 49888, 50946, 188472, 94296, 121456, 100558, 187052, 41956, 197966, 114804, 2322, 37556, 102686, 192982, 13202, 198...
scala> val x = bigPRng2.takeSample(true,1000,1234)
18/10/22 23:21:46 WARN scheduler.TaskSetManager: Stage 15 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
18/10/22 23:21:46 WARN scheduler.TaskSetManager: Stage 16 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
x: Array[Int] = Array(58180, 115670, 20038, 197136, 74802, 168364, 10386, 113546, 133782, 33844, 104378, 63586, 95716, 166390, 26314, 118614, 72380, 148474, 125294, 67858, 168820, 181684, 121832, 64804, 164854, 156202, 49680, 24198, 126430, 31890, 178666, 4868, 19166, 6644, 165826, 61906, 102314, 155714, 46688, 132820, 174446, 164376, 69740, 112170, 100186, 2824, 124978, 61304, 188802, 175104, 77978, 192202, 184244, 91750, 173898, 43122, 44174, 192002, 12776, 198562, 149208, 198382, 70822, 74210, 112034, 126006, 136660, 161938, 192292, 57972, 196212, 119050, 98936, 75076, 184604, 96474, 144724, 15690, 4464, 22368, 142552, 189946, 5654, 100718, 128298, 20642, 55708, 133306, 49888, 50946, 188472, 94296, 121456, 100558, 187052, 41956, 197966, 114804, 2322, 37556, 102686, 192982, 13202, 198...
scala> bigPRng2.mean
18/10/22 23:24:23 WARN scheduler.TaskSetManager: Stage 17 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res111: Double = 100000.99999999923
scala> bigPRng2.min
18/10/22 23:24:30 WARN scheduler.TaskSetManager: Stage 18 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res112: Int = 2
scala> bigPRng2.max
18/10/22 23:24:36 WARN scheduler.TaskSetManager: Stage 19 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res113: Int = 200000
scala> bigPRng2.stats
18/10/22 23:25:11 WARN scheduler.TaskSetManager: Stage 20 contains a task of very large size (392 KB). The maximum recommended task size is 100 KB.
res114: org.apache.spark.util.StatCounter = (count: 100000, mean: 100001.000000, stdev: 57735.026916, max: 200000.000000, min: 2.000000)
Correlation:
tests whether two lists of numbers are correlated
// Here we have 2 different sets (series1, series2) of random values.
// We are going to test the correlation between them.
scala> val series1 = Array.fill(100000)(Random.nextDouble)
series1: Array[Double] = Array(0.4003148782904564, 0.07544386592595631, 0.11539508758739403, 0.4963449391212429, 0.977528439465917, 0.9921300387841547, 0.5861702062478198, 0.44703958353366946, 0.6139942287608438, 0.31946501415210815, 0.18219470979346963, 0.6238651900256478, 0.9550761980932683, 0.15960535593688452, 0.376877013561561, 0.6358393625927671, 0.4571433688545875, 0.02722800537743353, 0.82887628633672, 0.7132477355926489, 0.6680269518041401, 0.4550674990144563, 0.8262564059351563, 0.5650623380048277, 0.9343657014854382, 0.25721478202085923, 0.6652338832483775, 0.9297812318583933, 0.07688619567135646, 0.33814753485477145, 0.4034919692040525, 0.34486666022461543, 0.6303277731201168, 0.28018855442225143, 0.12432164634987464, 0.27030992194849635, 0.8316363795164277, 0.06806531178677...
scala> val series2 = Array.fill(100000)(Random.nextDouble)
series2: Array[Double] = Array(0.1391225207098482, 0.2017061373781609, 0.9237840457672266, 0.11569933922540798, 0.9843651584920243, 0.1977110367734588, 0.39617186053922093, 0.12833064029269514, 0.802899598102286, 0.7615134456423496, 0.9234516978762192, 0.09848942336217725, 0.8058362763562198, 0.569129858888236, 0.5502197354285807, 0.4601391309978844, 0.9145954548463847, 0.8559317839507914, 0.3205636979066885, 0.719960846417004, 0.7553303691557286, 0.7425580377590679, 0.5217543510586904, 0.08263214958998488, 0.7142726052500101, 0.917074863011217, 0.3285955316144695, 0.8312946409732378, 0.5716072908667935, 0.10041656125599074, 0.8899673467448076, 0.15245023905445287, 0.5371515823401966, 0.31686529664937346, 0.8453857122617168, 0.9548766579391516, 0.34802289789761975, 0.367306202904618, 0....
//Converting random sets (Series#1, Series#2) into RDDs:
scala> val pseries1 = sc.parallelize(series1)
pseries1: org.apache.spark.rdd.RDD[Double] = ParallelCollectionRDD[29] at parallelize at <console>:41
scala> val pseries2 = sc.parallelize(series2)
pseries2: org.apache.spark.rdd.RDD[Double] = ParallelCollectionRDD[30] at parallelize at <console>:41
// pearson correlation
scala> val myCorrelation:Double = Statistics.corr(pseries1,pseries2,"pearson")
myCorrelation: Double = 0.004358097599783705
scala> val distTest = Statistics.kolmogorovSmirnovTest(pseries1,"norm",0,1)
18/10/22 23:31:39 WARN scheduler.TaskSetManager: Stage 24 contains a task of very large size (783 KB). The maximum recommended task size is 100 KB.
18/10/22 23:31:39 WARN scheduler.TaskSetManager: Stage 25 contains a task of very large size (783 KB). The maximum recommended task size is 100 KB.
distTest: org.apache.spark.mllib.stat.test.KolmogorovSmirnovTestResult =
Kolmogorov-Smirnov test summary:
degrees of freedom = 0
statistic = 0.5000015852249269
pValue = 7.296279136426165E-10
Very strong presumption against null hypothesis: Sample follows theoretical distribution.
DataFrames:
-----------
DataFrames are similar to relational tables:
[cloudera@quickstart ~]$ hdfs dfs -copyFromLocal employee.txt Sparks
scala> val file = sc.textFile("/user/cloudera/Sparks/employee.txt")
file: org.apache.spark.rdd.RDD[String] = /user/cloudera/Sparks/employee.txt MapPartitionsRDD[42] at textFile at <console>:39
scala> file.take(5)
res117: Array[String] = Array(id,last_name,email,gender,department,start_date,salary,job_title,region_id, 1,'Kelley','rkelley0@soundcloud.com','Female','Computers','10/2/2009',67470,'Structural Engineer',2, 2,'Armstrong','sarmstrong1@infoseek.co.jp','Male','Sports','3/31/2008',71869,'Financial Advisor',2, 3,'Carr','fcarr2@woothemes.com','Male','Automotive','7/12/2009',101768,'Recruiting Manager',3, 4,'Murray','jmurray3@gov.uk','Female','Jewelery','12/25/2014',96897,'Desktop Support Technician',3)
Spark 2.0:
----------
scala> spark.read.option("header","true").csv("filename")
C:\Spark\bin>spark-shell
scala> import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SparkSession
scala> val spark = SparkSession.builder().appName("DataFrameExercise").getOrCreate()
2018-10-23 20:40:41 WARN SparkSession$Builder:66 - Using an existing SparkSession; some configuration may not take effect.
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@606d6d2c
val df_emps = spark.read.option("header","true").csv("c:\\source\\employee.txt")
df_emps.take(10)
scala> df_emps.take(10).foreach(println)
[1,'Kelley','rkelley0@soundcloud.com','Female','Computers','10/2/2009',67470,'Structural Engineer',2]
[2,'Armstrong','sarmstrong1@infoseek.co.jp','Male','Sports','3/31/2008',71869,'Financial Advisor',2]
[3,'Carr','fcarr2@woothemes.com','Male','Automotive','7/12/2009',101768,'Recruiting Manager',3]
[4,'Murray','jmurray3@gov.uk','Female','Jewelery','12/25/2014',96897,'Desktop Support Technician',3]
[5,'Ellis','jellis4@sciencedirect.com','Female','Grocery','9/19/2002',63702,'Software Engineer III',7]
[6,'Phillips','bphillips5@time.com','Male','Tools','8/21/2013',118497,'Executive Secretary',1]
[7,'Williamson','rwilliamson6@ted.com','Male','Computers','5/14/2006',65889,'Dental Hygienist',6]
[8,'Harris','aharris7@ucoz.com','Female','Toys','8/12/2003',84427,'Safety Technician I',4]
[9,'James','rjames8@prnewswire.com','Male','Jewelery','9/7/2005',108657,'Sales Associate',2]
[10,'Sanchez','rsanchez9@cloudflare.com','Male','Movies','3/13/2013',108093,'Sales Representative',1]
scala> df_emps.schema.foreach(println)
StructField(id,StringType,true)
StructField(last_name,StringType,true)
StructField(email,StringType,true)
StructField(gender,StringType,true)
StructField(department,StringType,true)
StructField(start_date,StringType,true)
StructField(salary,StringType,true)
StructField(job_title,StringType,true)
StructField(region_id,StringType,true)
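Every column comes back as StringType because the CSV reader does not infer types by default. A small sketch (assuming the same employee.txt file) that asks Spark to infer the column types instead:
// hypothetical variant: let Spark guess numeric columns such as id and salary
val df_emps_typed = spark.read
  .option("header", "true")
  .option("inferSchema", "true")   // scans the data and infers column types
  .csv("c:\\source\\employee.txt")
df_emps_typed.printSchema()        // id and salary should now show up as numeric columns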
scala> df_emps.show(5)
+---+-----------+--------------------+--------+------------+------------+------+--------------------+---------+
| id| last_name| email| gender| department| start_date|salary| job_title|region_id|
+---+-----------+--------------------+--------+------------+------------+------+--------------------+---------+
| 1| 'Kelley'|'rkelley0@soundcl...|'Female'| 'Computers'| '10/2/2009'| 67470|'Structural Engin...| 2|
| 2|'Armstrong'|'sarmstrong1@info...| 'Male'| 'Sports'| '3/31/2008'| 71869| 'Financial Advisor'| 2|
| 3| 'Carr'|'fcarr2@woothemes...| 'Male'|'Automotive'| '7/12/2009'|101768|'Recruiting Manager'| 3|
| 4| 'Murray'| 'jmurray3@gov.uk'|'Female'| 'Jewelery'|'12/25/2014'| 96897|'Desktop Support ...| 3|
| 5| 'Ellis'|'jellis4@scienced...|'Female'| 'Grocery'| '9/19/2002'| 63702|'Software Enginee...| 7|
+---+-----------+--------------------+--------+------------+------------+------+--------------------+---------+
only showing top 5 rows
scala> val df_dd = spark.read.option("header","true").csv("c://source//dept_div.txt")
df_dd: org.apache.spark.sql.DataFrame = [department: string, company_division: string]
scala> df_dd.show()
+-------------+--------------------+
| department| company_division|
+-------------+--------------------+
| 'Automotive'| 'Auto & Hardware'|
| 'Baby'| 'Domestic'|
| 'Beauty'| 'Domestic'|
| 'Clothing'| 'Domestic'|
| 'Computers'|'Electronic Equip...|
|'Electronics'|'Electronic Equip...|
| 'Games'| 'Domestic'|
| 'Garden'| 'Outdoors & Garden'|
| 'Grocery'| 'Domestic'|
| 'Health'| 'Domestic'|
| 'Home'| 'Domestic'|
| 'Industrial'| 'Auto & Hardware'|
| 'Jewelery'| 'Fashion'|
| 'Kids'| 'Domestic'|
| 'Movies'| 'Entertainment'|
| 'Music'| 'Entertainment'|
| 'Outdoors'| 'Outdoors & Garden'|
| 'Shoes'| 'Domestic'|
| 'Sports'| 'Games & Sports'|
| 'Tools'| 'Auto & Hardware'|
+-------------+--------------------+
only showing top 20 rows
We can use the SQL approach to do grouping and filtering operations:
scala> df_emps.createOrReplaceTempView("employees")
scala> val sqldf_emps = spark.sql("SELECT * FROM employees")
sqldf_emps: org.apache.spark.sql.DataFrame = [id: string, last_name: string ... 7 more fields]
scala> val sqldf_emps_by_dept = spark.sql("SELECT department, count(*) from employees GROUP BY department")
sqldf_emps_by_dept: org.apache.spark.sql.DataFrame = [department: string, count(1): bigint]
scala> sqldf_emps_by_dept.show()
+-------------+--------+
| department|count(1)|
+-------------+--------+
| 'Clothing'| 53|
| 'Books'| 47|
| 'Garden'| 47|
| 'Baby'| 45|
| 'Beauty'| 53|
| 'Automotive'| 46|
| 'Grocery'| 46|
| 'Home'| 52|
|'Electronics'| 49|
| 'Sports'| 40|
| 'Health'| 46|
| 'Outdoors'| 48|
| 'Kids'| 38|
| 'Tools'| 39|
| 'Music'| 37|
| 'Games'| 49|
| 'Movies'| 36|
| 'Toys'| 41|
| 'Jewelery'| 46|
| 'Computers'| 52|
+-------------+--------+
only showing top 20 rows
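The same grouping can also be written with the DataFrame API instead of SQL; a minimal sketch using the df_emps DataFrame defined above:
// DataFrame-API equivalent of the GROUP BY query above
val df_emps_by_dept = df_emps.groupBy("department").count()
df_emps_by_dept.show(5)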
scala> val sqldf_emps_by_dept_gender = spark.sql("SELECT department, gender, count(*) FROM employees GROUP BY department, gender")
sqldf_emps_by_dept_gender: org.apache.spark.sql.DataFrame = [department: string, gender: string ... 1 more field]
scala> sqldf_emps_by_dept_gender.show()
+-------------+--------+--------+
| department| gender|count(1)|
+-------------+--------+--------+
| 'Grocery'| 'Male'| 22|
| 'Health'|'Female'| 25|
| 'Health'| 'Male'| 21|
| 'Tools'|'Female'| 21|
| 'Home'| 'Male'| 20|
| 'Kids'| 'Male'| 17|
| 'Movies'| 'Male'| 22|
| 'Toys'| 'Male'| 24|
| 'Computers'|'Female'| 26|
| 'Outdoors'| 'Male'| 20|
| 'Beauty'| 'Male'| 25|
| 'Automotive'| 'Male'| 26|
| 'Books'| 'Male'| 23|
| 'Beauty'|'Female'| 28|
| 'Music'| 'Male'| 18|
| 'Games'| 'Male'| 24|
|'Electronics'|'Female'| 23|
| 'Industrial'| 'Male'| 29|
| 'Jewelery'| 'Male'| 26|
| 'Industrial'|'Female'| 18|
+-------------+--------+--------+
only showing top 20 rows
scala> val sqldf_depts = spark.sql("SELECT DISTINCT department FROM employees")
sqldf_depts: org.apache.spark.sql.DataFrame = [department: string]
scala> sqldf_depts.show()
+-------------+
| department|
+-------------+
| 'Clothing'|
| 'Books'|
| 'Garden'|
| 'Baby'|
| 'Beauty'|
| 'Automotive'|
| 'Grocery'|
| 'Home'|
|'Electronics'|
| 'Sports'|
| 'Health'|
| 'Outdoors'|
| 'Kids'|
| 'Tools'|
| 'Music'|
| 'Games'|
| 'Movies'|
| 'Toys'|
| 'Jewelery'|
| 'Computers'|
+-------------+
only showing top 20 rows
scala> val sqldf_emps_100 = spark.sql("SELECT * FROM employees WHERE id < 100")
sqldf_emps_100: org.apache.spark.sql.DataFrame = [id: string, last_name: string ... 7 more fields]
scala> sqldf_emps_100
res21: org.apache.spark.sql.DataFrame = [id: string, last_name: string ... 7 more fields]
scala> sqldf_emps_100.show
+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
| id| last_name| email| gender| department| start_date|salary| job_title|region_id|
+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
| 1| 'Kelley'|'rkelley0@soundcl...|'Female'| 'Computers'| '10/2/2009'| 67470|'Structural Engin...| 2|
| 2| 'Armstrong'|'sarmstrong1@info...| 'Male'| 'Sports'| '3/31/2008'| 71869| 'Financial Advisor'| 2|
| 3| 'Carr'|'fcarr2@woothemes...| 'Male'| 'Automotive'| '7/12/2009'|101768|'Recruiting Manager'| 3|
| 4| 'Murray'| 'jmurray3@gov.uk'|'Female'| 'Jewelery'|'12/25/2014'| 96897|'Desktop Support ...| 3|
| 5| 'Ellis'|'jellis4@scienced...|'Female'| 'Grocery'| '9/19/2002'| 63702|'Software Enginee...| 7|
| 6| 'Phillips'|'bphillips5@time....| 'Male'| 'Tools'| '8/21/2013'|118497|'Executive Secret...| 1|
| 7|'Williamson'|'rwilliamson6@ted...| 'Male'| 'Computers'| '5/14/2006'| 65889| 'Dental Hygienist'| 6|
| 8| 'Harris'| 'aharris7@ucoz.com'|'Female'| 'Toys'| '8/12/2003'| 84427|'Safety Technicia...| 4|
| 9| 'James'|'rjames8@prnewswi...| 'Male'| 'Jewelery'| '9/7/2005'|108657| 'Sales Associate'| 2|
| 10| 'Sanchez'|'rsanchez9@cloudf...| 'Male'| 'Movies'| '3/13/2013'|108093|'Sales Representa...| 1|
| 11| 'Jacobs'|'jjacobsa@sbwire....|'Female'| 'Jewelery'|'11/27/2003'|121966|'Community Outrea...| 7|
| 12| 'Black'|'mblackb@edublogs...| 'Male'| 'Clothing'| '2/4/2003'| 44179| 'Data Coordiator'| 7|
| 13| 'Schmidt'|'sschmidtc@state....| 'Male'| 'Baby'|'10/13/2002'| 85227|'Compensation Ana...| 3|
| 14| 'Webb'| 'awebbd@baidu.com'|'Female'| 'Computers'|'10/22/2006'| 59763|'Software Test En...| 4|
| 15| 'Jacobs'|'ajacobse@google.it'|'Female'| 'Games'| '3/4/2007'|141139|'Community Outrea...| 7|
| 16| 'Medina'|'smedinaf@amazona...|'Female'| 'Baby'| '3/14/2008'|106659| 'Web Developer III'| 1|
| 17| 'Morgan'|'dmorgang@123-reg...|'Female'| 'Kids'| '5/4/2011'|148952| 'Programmer IV'| 6|
| 18| 'Nguyen'|'jnguyenh@google....| 'Male'| 'Home'| '11/3/2014'| 93804| 'Geologist II'| 5|
| 19| 'Day'|'rdayi@chronoengi...| 'Male'|'Electronics'| '9/22/2004'|109890| 'VP Sales'| 3|
| 20| 'Carr'| 'dcarrj@ocn.ne.jp'|'Female'| 'Movies'|'11/22/2007'|115274|'VP Quality Control'| 5|
+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
With Spark SQL we can join DataFrames:
scala> df_emps.show()
+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
| id| last_name| email| gender| department| start_date|salary| job_title|region_id|
+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
| 1| 'Kelley'|'rkelley0@soundcl...|'Female'| 'Computers'| '10/2/2009'| 67470|'Structural Engin...| 2|
| 2| 'Armstrong'|'sarmstrong1@info...| 'Male'| 'Sports'| '3/31/2008'| 71869| 'Financial Advisor'| 2|
| 3| 'Carr'|'fcarr2@woothemes...| 'Male'| 'Automotive'| '7/12/2009'|101768|'Recruiting Manager'| 3|
| 4| 'Murray'| 'jmurray3@gov.uk'|'Female'| 'Jewelery'|'12/25/2014'| 96897|'Desktop Support ...| 3|
| 5| 'Ellis'|'jellis4@scienced...|'Female'| 'Grocery'| '9/19/2002'| 63702|'Software Enginee...| 7|
| 6| 'Phillips'|'bphillips5@time....| 'Male'| 'Tools'| '8/21/2013'|118497|'Executive Secret...| 1|
| 7|'Williamson'|'rwilliamson6@ted...| 'Male'| 'Computers'| '5/14/2006'| 65889| 'Dental Hygienist'| 6|
| 8| 'Harris'| 'aharris7@ucoz.com'|'Female'| 'Toys'| '8/12/2003'| 84427|'Safety Technicia...| 4|
| 9| 'James'|'rjames8@prnewswi...| 'Male'| 'Jewelery'| '9/7/2005'|108657| 'Sales Associate'| 2|
| 10| 'Sanchez'|'rsanchez9@cloudf...| 'Male'| 'Movies'| '3/13/2013'|108093|'Sales Representa...| 1|
| 11| 'Jacobs'|'jjacobsa@sbwire....|'Female'| 'Jewelery'|'11/27/2003'|121966|'Community Outrea...| 7|
| 12| 'Black'|'mblackb@edublogs...| 'Male'| 'Clothing'| '2/4/2003'| 44179| 'Data Coordiator'| 7|
| 13| 'Schmidt'|'sschmidtc@state....| 'Male'| 'Baby'|'10/13/2002'| 85227|'Compensation Ana...| 3|
| 14| 'Webb'| 'awebbd@baidu.com'|'Female'| 'Computers'|'10/22/2006'| 59763|'Software Test En...| 4|
| 15| 'Jacobs'|'ajacobse@google.it'|'Female'| 'Games'| '3/4/2007'|141139|'Community Outrea...| 7|
| 16| 'Medina'|'smedinaf@amazona...|'Female'| 'Baby'| '3/14/2008'|106659| 'Web Developer III'| 1|
| 17| 'Morgan'|'dmorgang@123-reg...|'Female'| 'Kids'| '5/4/2011'|148952| 'Programmer IV'| 6|
| 18| 'Nguyen'|'jnguyenh@google....| 'Male'| 'Home'| '11/3/2014'| 93804| 'Geologist II'| 5|
| 19| 'Day'|'rdayi@chronoengi...| 'Male'|'Electronics'| '9/22/2004'|109890| 'VP Sales'| 3|
| 20| 'Carr'| 'dcarrj@ocn.ne.jp'|'Female'| 'Movies'|'11/22/2007'|115274|'VP Quality Control'| 5|
+---+------------+--------------------+--------+-------------+------------+------+--------------------+---------+
only showing top 20 rows
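df_cr (the company regions DataFrame shown next) is not defined in this part of the transcript; presumably it was loaded the same way as the other CSV files, for example (hypothetical filename):
// assumption: a company_regions.csv with columns region_id, company_regions, country
val df_cr = spark.read.option("header","true").csv("c:\\source\\company_regions.csv")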
scala> df_cr.show()
+---------+-------------------+---------+
|region_id| company_regions| country|
+---------+-------------------+---------+
| 1| 'Northeast'| 'USA'|
| 2| 'Southeast'| 'USA'|
| 3| 'Northwest'| 'USA'|
| 4| 'Southwest'| 'USA'|
| 5| 'British Columbia'| 'Canada'|
| 6| 'Quebec'| 'Canada'|
| 7| 'Nova Scotia'| 'Canada'|
+---------+-------------------+---------+
// joining 2 dataframes
scala> val df_joined = df_emps.join(df_cr,"region_id")
df_joined: org.apache.spark.sql.DataFrame = [region_id: string, id: string ... 9 more fields]
scala> df_joined.show()
+---------+---+------------+--------------------+--------+-------------+------------+------+--------------------+-------------------
|region_id| id| last_name| email| gender| department| start_date|salary| job_title| company_regions
+---------+---+------------+--------------------+--------+-------------+------------+------+--------------------+-------------------
| 2| 1| 'Kelley'|'rkelley0@soundcl...|'Female'| 'Computers'| '10/2/2009'| 67470|'Structural Engin...| 'Southeast'
| 2| 2| 'Armstrong'|'sarmstrong1@info...| 'Male'| 'Sports'| '3/31/2008'| 71869| 'Financial Advisor'| 'Southeast'
| 3| 3| 'Carr'|'fcarr2@woothemes...| 'Male'| 'Automotive'| '7/12/2009'|101768|'Recruiting Manager'| 'Northwest'
| 3| 4| 'Murray'| 'jmurray3@gov.uk'|'Female'| 'Jewelery'|'12/25/2014'| 96897|'Desktop Support ...| 'Northwest'
| 7| 5| 'Ellis'|'jellis4@scienced...|'Female'| 'Grocery'| '9/19/2002'| 63702|'Software Enginee...| 'Nova Scotia'
| 1| 6| 'Phillips'|'bphillips5@time....| 'Male'| 'Tools'| '8/21/2013'|118497|'Executive Secret...| 'Northeast'
| 6| 7|'Williamson'|'rwilliamson6@ted...| 'Male'| 'Computers'| '5/14/2006'| 65889| 'Dental Hygienist'| 'Quebec'
| 4| 8| 'Harris'| 'aharris7@ucoz.com'|'Female'| 'Toys'| '8/12/2003'| 84427|'Safety Technicia...| 'Southwest'
| 2| 9| 'James'|'rjames8@prnewswi...| 'Male'| 'Jewelery'| '9/7/2005'|108657| 'Sales Associate'| 'Southeast'
| 1| 10| 'Sanchez'|'rsanchez9@cloudf...| 'Male'| 'Movies'| '3/13/2013'|108093|'Sales Representa...| 'Northeast'
| 7| 11| 'Jacobs'|'jjacobsa@sbwire....|'Female'| 'Jewelery'|'11/27/2003'|121966|'Community Outrea...| 'Nova Scotia'
| 7| 12| 'Black'|'mblackb@edublogs...| 'Male'| 'Clothing'| '2/4/2003'| 44179| 'Data Coordiator'| 'Nova Scotia'
| 3| 13| 'Schmidt'|'sschmidtc@state....| 'Male'| 'Baby'|'10/13/2002'| 85227|'Compensation Ana...| 'Northwest'
| 4| 14| 'Webb'| 'awebbd@baidu.com'|'Female'| 'Computers'|'10/22/2006'| 59763|'Software Test En...| 'Southwest'
| 7| 15| 'Jacobs'|'ajacobse@google.it'|'Female'| 'Games'| '3/4/2007'|141139|'Community Outrea...| 'Nova Scotia'
| 1| 16| 'Medina'|'smedinaf@amazona...|'Female'| 'Baby'| '3/14/2008'|106659| 'Web Developer III'| 'Northeast'
| 6| 17| 'Morgan'|'dmorgang@123-reg...|'Female'| 'Kids'| '5/4/2011'|148952| 'Programmer IV'| 'Quebec'
| 5| 18| 'Nguyen'|'jnguyenh@google....| 'Male'| 'Home'| '11/3/2014'| 93804| 'Geologist II'| 'British Columbia'
| 3| 19| 'Day'|'rdayi@chronoengi...| 'Male'|'Electronics'| '9/22/2004'|109890| 'VP Sales'| 'Northwest'
| 5| 20| 'Carr'| 'dcarrj@ocn.ne.jp'|'Female'| 'Movies'|'11/22/2007'|115274|'VP Quality Control'| 'British Columbia'
+---------+---+------------+--------------------+--------+-------------+------------+------+--------------------+-------------------
only showing top 20 rows
// load json file
scala> import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SparkSession
scala> val spark = SparkSession.builder().appName("DataFrameExercise").getOrCreate()
2018-10-23 21:32:08 WARN SparkSession$Builder:66 - Using an existing SparkSession; some configuration may not take effect.
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@606d6d2c
scala> val df_json_dd = spark.read.json("c://source//dept_div.json")
df_json_dd: org.apache.spark.sql.DataFrame = [company_division: string, department: string]
scala> df_json_dd.show()
+--------------------+-------------+
| company_division| department|
+--------------------+-------------+
| 'Auto & Hardware'| 'Automotive'|
| 'Domestic'| 'Baby'|
| 'Domestic'| 'Beauty'|
| 'Domestic'| 'Clothing'|
|'Electronic Equip...| 'Computers'|
|'Electronic Equip...|'Electronics'|
| 'Domestic'| 'Games'|
| 'Outdoors & Garden'| 'Garden'|
| 'Domestic'| 'Grocery'|
| 'Domestic'| 'Health'|
| 'Domestic'| 'Home'|
| 'Auto & Hardware'| 'Industrial'|
| 'Fashion'| 'Jewelery'|
| 'Domestic'| 'Kids'|
| 'Entertainment'| 'Movies'|
| 'Entertainment'| 'Music'|
| 'Outdoors & Garden'| 'Outdoors'|
| 'Domestic'| 'Shoes'|
| 'Games & Sports'| 'Sports'|
| 'Auto & Hardware'| 'Tools'|
+--------------------+-------------+
only showing top 20 rows
----------------------------------------------------
scala> :paste
// Entering paste mode (ctrl-D to finish)
val a = 10
val b = 20
val c = 30
print (a+b+c)
// Exiting paste mode, now interpreting.
60a: Int = 10
b: Int = 20
c: Int = 30
scala> val i = 1+2
i: Int = 3
scala> val i = 1.+(2)
i: Int = 3
scala> print("Hello World")
Hello World
scala> val s = "Hello World"
s: String = Hello World
scala> s.toUpperCase
res8: String = HELLO WORLD
scala> s.toLowerCase
res9: String = hello world
scala> s.substring(6)
res11: String = World
scala> s.substring(6,8)
res12: String = Wo
s.length -- Tab
scala> s.length
length lengthCompare
scala> s.length -- Tab and one more Tab again
def length(): Int
for (i <- (1 to 100)) { print(i + " ")}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
scala> :paste
// Entering paste mode (ctrl-D to finish)
var total = 0
for (element <- (1 to 100))
total += element
// Exiting paste mode, now interpreting.
total: Int = 5050
var total = 0
for (element <- (1 to 100))
{
if (element % 2 == 0) total += element
}
// Exiting paste mode, now interpreting.
total: Int = 2550
scala> :paste
// Entering paste mode (ctrl-D to finish)
var totalEven = 0
var totalOdd = 0
for (element <- (1 to 100))
{
if (element % 2 == 0) totalEven += element else totalOdd += element
}
// Exiting paste mode, now interpreting.
totalEven: Int = 2550
totalOdd: Int = 2500
scala> :paste
// Entering paste mode (ctrl-D to finish)
var lb = 1
var ub = 100
var totalEven = 0
var totalOdd = 0
while (lb <= ub)
{
if (lb % 2 == 0) totalEven += lb else totalOdd += lb
lb += 1
}
// Exiting paste mode, now interpreting.
lb: Int = 101
ub: Int = 100
totalEven: Int = 2550
totalOdd: Int = 2500
^l to clear screen (Ctrl + l)
scala> :paste
// Entering paste mode (ctrl-D to finish)
def sum(lb: Int, ub: Int) = {
var total = 0
for(element <- lb to ub)
{
total += element
}
total
}
// Exiting paste mode, now interpreting.
sum: (lb: Int, ub: Int)Int
scala> sum (Tab twice)
def sum(lb: Int,ub: Int): Int
scala> sum(1,10)
res22: Int = 55
scala> :paste
// Entering paste mode (ctrl-D to finish)
def sum(func: Int => Int, lb: Int, ub: Int) = {
var total = 0
for(element <- lb to ub)
{
total += func(element)
}
total
}
// Exiting paste mode, now interpreting.
sum: (func: Int => Int, lb: Int, ub: Int)Int
scala> def id(i: Int) = i
id: (i: Int)Int
scala> def sqr(i: Int) = i * i
sqr: (i: Int)Int
scala> def cube (i : Int) = i * i * i
cube: (i: Int)Int
scala> def double (i: Int) = i * 2
double: (i: Int)Int
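The higher-order sum above can then be applied with any of these helper functions; a small usage sketch (expected results in comments):
sum(id, 1, 10)      // 55  -- sum of 1..10
sum(sqr, 1, 10)     // 385 -- sum of the squares of 1..10
sum(cube, 1, 3)     // 36  -- 1 + 8 + 27
sum(double, 1, 10)  // 110 -- twice the sum of 1..10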
Function arguments :
do not mark them val or var (they are neither mutable nor immutable)
you have to give the data type
def functionName(arg1 : dataType, arg2 : dataType) : returnType
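A small sketch that follows this template (hypothetical names):
def multiply(a : Int, b : Int) : Int = {
  a * b          // the last expression is the return value
}
multiply(3, 4)   // 12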
Create a Class:
----------------
scala> class Order(orderId : Int, orderDate : String, orderCustomerId : Int, orderStatus : String){
| println("I am inside Order Constructor")
| }
defined class Order
compile it:
------------
:javap -p Order
Compiled from "<console>"
public class $line3.$read$$iw$$iw$Order {
public $line3.$read$$iw$$iw$Order(int, java.lang.String, int, java.lang.String);
}
scala> val order = new Order(1,"2013-10-01 00:00:00.000", 100,"COMPLETE")
I am inside Order Constructor
order: Order = Order@182cc69e
scala> class Order(orderId : Int, orderDate : String, orderCustomerId : Int, orderStatus : String){
| println ("I am inside Order constructor")
| override def toString = " Order ( " + orderId + "," + orderDate + " ," + orderCustomerId + ", " + orderStatus + ")"
| }
defined class Order
scala> val order = new Order(1,"2013-10-01 00:00:00.000", 100,"COMPLETE")
I am inside Order constructor
order: Order = Order ( 1,2013-10-01 00:00:00.000 ,100, COMPLETE)
scala> println(order);
Order ( 1,2013-10-01 00:00:00.000 ,100, COMPLETE)
REPL : Read Evaluate Print Loop
scala> 10 + 3 * 5 / 2
res0: Int = 17
scala> "Your answer is " + res0
res1: String = Your answer is 17
// var is going to change - variable
scala> var myName = "Derek"
myName: String = Derek
//val is not going to change -- constant / never change
scala> val myAge = 40
myAge: Int = 40
comments
// single line comments
/*
multi line comments
*/
Data Types:
Byte, Boolean, Char, Short, Int, Long, Float, Double
BigInt
scala> val lgPrime = BigInt("6222288956456456456894864564648947895615648978945616549789641561489489489461564894894615618944561564")
lgPrime: scala.math.BigInt = 6222288956456456456894864564648947895615648978945616549789641561489489489461564894894615618944561564
scala> lgPrime+1
res3: scala.math.BigInt = 6222288956456456456894864564648947895615648978945616549789641561489489489461564894894615618944561565
scala> println ("5 + 4 = " + ( 5 + 4 ))
5 + 4 = 9
scala> println ("5 - 4 = " + ( 5 - 4 ))
5 - 4 = 1
scala> println ("5 * 4 = " + ( 5 * 4 ))
5 * 4 = 20
scala> println ("5 / 4 = " + ( 5 / 4 ))
5 / 4 = 1
scala> println ("5 % 4 = " + ( 5 % 4 ))
5 % 4 = 1
import math library:
------------------------
scala> import scala.math._
import scala.math._
scala> ceil(5.45)
res9: Double = 6.0
scala> round(5.45)
res10: Long = 5
scala> floor(5.45)
res11: Double = 5.0
scala> exp(1)
res12: Double = 2.718281828459045
scala> pow(2,2)
res13: Double = 4.0
scala> sqrt(pow(2,2) + pow(2,2))
res14: Double = 2.8284271247461903
scala> hypot(2,2)
res15: Double = 2.8284271247461903
scala> log10(1000)
res16: Double = 3.0
scala> log(2.7182818284590455)
res17: Double = 1.0
scala> min(5,10)
res20: Int = 5
scala> max(1,1000)
res21: Int = 1000
scala> (random * (11-1) + 1).toInt
res22: Int = 7
scala> (random * (11-1) + 1).toInt
res23: Int = 3
scala> var age = 19
age: Int = 19
scala> val canVote = if (age >= 18) "yes" else "no"
canVote: String = yes
// Multiline coding within scala shell CLI (Command Line Interface)
scala> :paste
// Entering paste mode (ctrl-D to finish)
var age = 17
val canVote = if (age >= 18) "yes" else "no"
// Exiting paste mode, now interpreting.
age: Int = 17
canVote: String = no
Exit from Scala
----------------
scala> :q // quit from Scala
C:\scala\bin>
^D to exit
Run Eclipse from here:
C:\scalaIDE\eclipse>eclipse.exe
eclipse:
---------
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
while (i <= 10)
{
println(i)
i += 1
}
}
}
4
5
6
7
8
9
10
in CLI:
--------
Run .scala program in CLI
C:\scala\exercise>scala sa.scala
0
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
while (i <= 10)
{
println(i)
i += 1
}
}
}
Result:
0
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
do {
println(i)
i += 1
}while (i <= 10)
}
}
Result:
-------
0
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
for(i <- 1 to 10)
{
println(i)
}
}
}
Result:
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
val randLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
for (i <- 0 until randLetters.length)
{
println(randLetters(i))
}
}
}
Result:
-------
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
val aList = List(1,2,3,4,5,6)
for (i <- aList)
{
println ("List Items #"+i)
}
}
}
Result:
-------
List Items #1
List Items #2
List Items #3
List Items #4
List Items #5
List Items #6
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
var evenList = for { i <- 1 to 20
if ( i % 2 == 0)
} yield i
for(i <- evenList)
println(i)
}
}
Result:
2
4
6
8
10
12
14
16
18
20
object ScalaTutorial
{
def main(args : Array[String])
{
var i = 0
for (i <- 1 to 5; j <- 6 to 10)
{
println ("i : " + i)
println ("j : " + j)
}
}
}
Result:
--------
i : 1
j : 6
i : 1
j : 7
i : 1
j : 8
i : 1
j : 9
i : 1
j : 10
i : 2
j : 6
i : 2
j : 7
i : 2
j : 8
i : 2
j : 9
i : 2
j : 10
i : 3
j : 6
i : 3
j : 7
i : 3
j : 8
i : 3
j : 9
i : 3
j : 10
i : 4
j : 6
i : 4
j : 7
i : 4
j : 8
i : 4
j : 9
i : 4
j : 10
i : 5
j : 6
i : 5
j : 7
i : 5
j : 8
i : 5
j : 9
i : 5
j : 10
import scala.io.StdIn.{readLine, readInt}
import scala.math._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
import scala.io.Source
object ScalaTutorial
{
def main(args : Array[String])
{
var numberGuess = 0
do {
print ("Guess a number ")
numberGuess = readLine.toInt
}while (numberGuess != 15)
printf ("You guessted the secret Number %d",15)
}
}
Result:
-------
Guess a number 3
Guess a number 2
Guess a number 4
Guess a number 15
You guessed the secret Number 15
object ScalaTutorial
{
def main(args : Array[String])
{
val name = "Derek"
val age = 39
val weight = 175.5
println(s"Hello $name")
println(f"I am ${age+1} and weigh $weight%.2f")
}
}
Result:
Hello Derek
I am 40 and weigh 175.50
%c --> character
%s --> string
%f --> float
%d --> integer
object ScalaTutorial
{
def main(args : Array[String])
{
printf("'%5d'\n",5)
printf("'%05d'\n",5)
printf("'%-5s'\n",5)
printf("'%5s'\n",5)
}
}
' 5'
'00005'
'5 '
' 5'
object ScalaTutorial
{
def main(args : Array[String])
{
var randSent = "I saw a dragon fly by"
println ("3rd index : " + randSent(3))
println("String length : " + randSent.length)
println(randSent.concat(" and explode"))
println("Are strings equal : " + "I saw a dragon".equals(randSent))
println("dragon starts at index : " + randSent.indexOf("dragon"))
val randSentArray = randSent.toArray
for (v <- randSentArray)
println (v)
}
}
Result:
-------
3rd index : a
String length : 21
I saw a dragon fly by and explode
Are strings equal : false
dragon starts at index : 8
I
s
a
w
a
d
r
a
g
o
n
f
l
y
b
y
function template:
------------------
def funcName (param1:dataType, param2:dataType) : returnType = {
// function body
// return valueToReturn
}
object ScalaTutorial
{
def main(args : Array[String])
{
def getSum(n1 : Int = 1, n2 : Int = 2) : Int = {
return n1 + n2 ;
}
println ( " 5 + 4 = " + getSum(5,4))
}
}
Result:
5 + 4 = 9
object ScalaTutorial
{
def main(args : Array[String])
{
def getSum(n1 : Int = 1, n2 : Int = 2) : Int = {
return n1 + n2 ;
}
println ( " 5 + 4 = " + getSum(5,4))
println("result= " + getSum(n2 = 3, n1 = 10)) // named arguments
}
}
Result:
5 + 4 = 9
result= 13
// Unit means void function - no return from the function
object ScalaTutorial
{
def main(args : Array[String])
{
def sayHi(): Unit = {
println("Hi! How are you?")
}
sayHi
}
}
Result :
Hi! How are you?
// variable number of parameters in a function
object ScalaTutorial
{
def main(args : Array[String])
{
def getSum(args : Int*) : Int = {
var sum : Int = 0
for (num <- args) {
sum += num
}
sum
}
println ("Get Sum " + getSum(1,2,3,4,5,6))
println ("Get Sum " + getSum(100,200, 300))
}
}
Get Sum 21
Get Sum 600
object ScalaTutorial
{
def main(args : Array[String])
{
def factorial(num : BigInt) : BigInt = {
if (num <= 1)
1
else
num * factorial (num - 1)
}
println("Factorial of 4 = " + factorial(4))
println("Factorial of 5 = " + factorial(5))
}
}
Result:
Factorial of 4 = 24
Factorial of 5 = 120
object ScalaTutorial
{
def main(args : Array[String])
{
val favNums = new Array[Int] (20)
val friends = Array("Bob","Tom")
friends(0) = "Sue"
println("Best Friends " + friends(0))
val friends2 = ArrayBuffer[String]()
friends2.insert(0,"Phil")
friends2 += "Mark"
friends2 ++= Array("Susy","Paul")
friends2.insert(1,"Mike","Sally","Sam","Mary","Sue")
friends2.remove(1,2)
var friend : String = ""
for(friend <- friends2)
println(friend)
}
}
Result:
Best Friends Sue
Phil
Sam
Mary
Sue
Mark
Susy
Paul
object ScalaTutorial
{
def main(args : Array[String])
{
val myList = List(1,2,3,4,5,6,7,8,9,10)
myList.foreach(println)
}
}
Result:
--------
1
2
3
4
5
6
7
8
9
10
object ScalaTutorial
{
def main(args : Array[String])
{
var mulTable = Array.ofDim[Int](10,10)
for(i <- 0 to 9)
{
for (j <- 0 to 9)
{
mulTable(i)(j) = i * j
}
}
for (i <- 0 to 9)
{
for (j <- 0 to 9)
{
printf ("%d : %d = %d\n",i,j,mulTable(i)(j))
}
}
}
}
Result:
0 : 0 = 0
0 : 1 = 0
0 : 2 = 0
0 : 3 = 0
0 : 4 = 0
0 : 5 = 0
0 : 6 = 0
0 : 7 = 0
0 : 8 = 0
0 : 9 = 0
1 : 0 = 0
1 : 1 = 1
1 : 2 = 2
1 : 3 = 3
1 : 4 = 4
1 : 5 = 5
1 : 6 = 6
1 : 7 = 7
1 : 8 = 8
1 : 9 = 9
2 : 0 = 0
2 : 1 = 2
2 : 2 = 4
2 : 3 = 6
2 : 4 = 8
2 : 5 = 10
2 : 6 = 12
2 : 7 = 14
2 : 8 = 16
2 : 9 = 18
3 : 0 = 0
3 : 1 = 3
3 : 2 = 6
3 : 3 = 9
3 : 4 = 12
3 : 5 = 15
3 : 6 = 18
3 : 7 = 21
3 : 8 = 24
3 : 9 = 27
4 : 0 = 0
4 : 1 = 4
4 : 2 = 8
4 : 3 = 12
4 : 4 = 16
4 : 5 = 20
4 : 6 = 24
4 : 7 = 28
4 : 8 = 32
4 : 9 = 36
5 : 0 = 0
5 : 1 = 5
5 : 2 = 10
5 : 3 = 15
5 : 4 = 20
5 : 5 = 25
5 : 6 = 30
5 : 7 = 35
5 : 8 = 40
5 : 9 = 45
6 : 0 = 0
6 : 1 = 6
6 : 2 = 12
6 : 3 = 18
6 : 4 = 24
6 : 5 = 30
6 : 6 = 36
6 : 7 = 42
6 : 8 = 48
6 : 9 = 54
7 : 0 = 0
7 : 1 = 7
7 : 2 = 14
7 : 3 = 21
7 : 4 = 28
7 : 5 = 35
7 : 6 = 42
7 : 7 = 49
7 : 8 = 56
7 : 9 = 63
8 : 0 = 0
8 : 1 = 8
8 : 2 = 16
8 : 3 = 24
8 : 4 = 32
8 : 5 = 40
8 : 6 = 48
8 : 7 = 56
8 : 8 = 64
8 : 9 = 72
9 : 0 = 0
9 : 1 = 9
9 : 2 = 18
9 : 3 = 27
9 : 4 = 36
9 : 5 = 45
9 : 6 = 54
9 : 7 = 63
9 : 8 = 72
9 : 9 = 81
object ScalaTutorial
{
def main(args : Array[String])
{
var r = scala.util.Random
val favNums = new Array[Int] (11)
for (i <- 0 to 10)
{
favNums(i) = r.nextInt(10)
println(favNums(i))
}
for (i <- 0 to 10)
{
println(favNums(i))
}
println("Sum : " + favNums.sum)
println("Min : " + favNums.min)
println("Max : " + favNums.max)
var sortedNums = favNums.sortWith(_>_)
println(sortedNums.deep.mkString(","))
}
}
Result:
-----------
2
8
3
5
9
5
4
5
4
4
2
2
8
3
5
9
5
4
5
4
4
2
Sum : 51
Min : 2
Max : 9
9,8,5,5,5,4,4,4,3,2,2
Spark-shell to get into REPL
Read Evaluate Print Loop (REPL) : interactive shell
Filter - row level, element level filter
scala> val myList = List(10,20,30,40,50,55,20,32)
myList: List[Int] = List(10, 20, 30, 40, 50, 55, 20, 32)
scala> val myResult = myList.filter( x => x > 30)
myResult: List[Int] = List(40, 50, 55, 32)
myList and myResult are local Scala objects
map,flatmap - to do transformations
scala> val names = List(" ravi "," rani", " veni", " venu ")
names: List[String] = List(" ravi ", " rani", " veni", " venu ")
scala> val newNames = names.map (x => x.trim().toUpperCase())
newNames: List[String] = List(RAVI, RANI, VENI, VENU)
scala> val a = 100
a: Int = 100
scala> val b = 200
b: Int = 200
scala> val c = a + b
c: Int = 300
scala> print (c)
300
Data types are automatically inferred by Scala based on the given input
scala> val a = 100
a: Int = 100
scala> val b = 100.32
b: Double = 100.32
scala> val c = "Rama"
c: String = Rama
Everything is an object
with variable members and method members
scala> val name = "Ravi"
name: String = Ravi
scala> val len = name.length
len: Int = 4
Batch processing involves working with collections more and more
scala> val name = "Ravi"
name: String = Ravi
scala> val len = name.length
len: Int = 4
scala> val a : Int = 100
a: Int = 100
scala> val a = 100
a: Int = 100
scala> val b : Double = 100.01
b: Double = 100.01
scala> val name : String = "Simbu"
name: String = Simbu
scala> val a = 100 // immutable / non changeable
a: Int = 100
scala> var b = 200 // mutable / changing is possible
b: Int = 200
scala> println(a)
100
scala> println(b)
200
scala> a = a+4
<console>:27: error: reassignment to val
a = a+4
^
scala> b = 500+b
b: Int = 700
val is Constant (immutable / non changeable / cannot be reassigned / read only)
var is variable (mutable / changeable / can be reassigned / read & write)
scala> val x = List(1,2,3,4,5)
x: List[Int] = List(1, 2, 3, 4, 5)
scala> x.sum
res6: Int = 15
scala> val x = List(1,2,3,4,5)
x: List[Int] = List(1, 2, 3, 4, 5)
scala> var t = 0 // t is a variable so the below statement is valid
t: Int = 0
scala> for (v <- x)
| t = t+v
scala> print(t)
15
scala> x.sum
res6: Int = 15
scala> val x = List(10,20,30,40,50)
x: List[Int] = List(10, 20, 30, 40, 50)
scala> val t = 0 // t is a constant so the below statement is invalid
t: Int = 0
scala> for (v <- x)
| t = t+v
<console>:31: error: reassignment to val // val doesn't allow reassignment
t = t+v
^
Every time the loop iterates, t + v is reassigned to the object t.
Array and List both come under Collection.
A List can grow but an Array can't.
Array size is fixed, but List size is dynamic (see the sketch below).
List and Array are for homogeneous items; the purpose of each element should be the same.
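A small sketch of the difference (standard library only): an Array element can be updated in place but the Array cannot grow, while "growing" an immutable List really builds a new list:
val arr = Array(1, 2, 3)
arr(0) = 99              // in-place update works; the size stays 3
// arr :+ 4 would build a *new* array; the original cannot grow

val lst = List(1, 2, 3)
val lst2 = 0 :: lst      // prepending creates a new list List(0, 1, 2, 3)
// lst itself is unchanged: List(1, 2, 3)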
scala> val names = Array("Ravi","Rani","Mukesh")
names: Array[String] = Array(Ravi, Rani, Mukesh)
scala> println(names(0))
Ravi
scala> names
res14: Array[String] = Array(Ravi, Rani, Mukesh)
scala> names(names.size-1) // to get last name
res15: String = Mukesh
scala> names.length
res16: Int = 3
scala> names.size
res17: Int = 3
scala> val myList = List(10,20,30,40)
myList: List[Int] = List(10, 20, 30, 40)
scala> myList(0)
res18: Int = 10
scala> myList(myList.size-1)
res19: Int = 40
scala> val x = List(1,3,5,7,9,10,15,20,30,40)
x: List[Int] = List(1, 3, 5, 7, 9, 10, 15, 20, 30, 40)
scala> x.size
res20: Int = 10
scala> x.length
res21: Int = 10
scala> x.slice(3,6)
res23: List[Int] = List(7, 9, 10)
scala> x.slice(0,5)
res24: List[Int] = List(1, 3, 5, 7, 9)
scala> x.slice(5,x.size)
res25: List[Int] = List(10, 15, 20, 30, 40)
scala> var y = List(10,20,30)
y: List[Int] = List(10, 20, 30)
scala> 333::y // prepending (adding new element at the beginning) // valid
res27: List[Int] = List(333, 10, 20, 30)
scala> y::311 // appending at the end with :: is not allowed; :: only prepends
<console>:28: error: value :: is not a member of Int
y::311
^
scala> val a = 10
a: Int = 10
scala> a = 100 // reassignment is not allowed
<console>:27: error: reassignment to val
a = 100
^
scala> val a = 100 // destructing existing and constructing new is allowed
a: Int = 100
scala> var y = List(10,20,30)
res29: List[Int] = List(10, 20, 30)
scala> val y = 40::y // recursive reassignment to a val is not allowed
<console>:27: error: recursive value y needs type
val y = 40::y
^
scala> val x = 40::y // here we create a new variable so allowed
x: List[Int] = List(40, 10, 20, 30)
// All the below things are valid because we create new variables only
scala> val y = List(10,20,30)
y: List[Int] = List(10, 20, 30)
scala> val z = 40::y
z: List[Int] = List(40, 10, 20, 30)
scala> val u = 50::z
u: List[Int] = List(50, 40, 10, 20, 30)
Recursive operation is allowed in var
scala> var a = List(1,2,3,4,5)
a: List[Int] = List(1, 2, 3, 4, 5)
scala> a = 40::a
a: List[Int] = List(40, 1, 2, 3, 4, 5)
scala> a = 4::a
a: List[Int] = List(4, 40, 1, 2, 3, 4, 5)
scala> a = 3::a
a: List[Int] = List(3, 4, 40, 1, 2, 3, 4, 5)
Recursive is valid for var
Recursive is invalid for val
/// Merging 2 different lists
scala> val x = List(1,2,3,4,5,6,7,8,9,10)
x: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
scala> val y = List(11,12,13,14,15)
y: List[Int] = List(11, 12, 13, 14, 15)
scala> x.size
res30: Int = 10
scala> y.size
res31: Int = 5
scala> x++y // inserting y at the end of x
res32: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
scala> y++x // inserting y at the beginning of x
res33: List[Int] = List(11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
scala> x.union(y) // combining y with x
res34: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
scala> y.union(x) // combining x with y
res35: List[Int] = List(11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
++ is the symbol for merging
Tuple : Heterogeneous means different, mixed data types; the purpose of each element can be different
scala> val t = ("Ravi",35,"BTech","Hyderabad")
t: (String, Int, String, String) = (Ravi,35,BTech,Hyderabad)
scala> val t = ("Ravi",35,"BTech","Hyderabad")
t: (String, Int, String, String) = (Ravi,35,BTech,Hyderabad)
scala> t._1 // have to specify position and not index
res40: String = Ravi
scala> t._2
res41: Int = 35
scala> t._3
res42: String = BTech
scala> t._4
res43: String = Hyderabad
List and Array : Homogeneous; they won't support mixed data types
Tuple : Heterogeneous; supports mixed data types
Map :
collection of key,value pairs
scala> var capitals = Map("Andhra"->"Amaravathi", "Telengana"->"Hyderabad")
capitals: scala.collection.immutable.Map[String,String] = Map(Andhra -> Amaravathi, Telengana -> Hyderabad)
scala> capitals
res44: scala.collection.immutable.Map[String,String] = Map(Andhra -> Amaravathi, Telengana -> Hyderabad)
scala> capitals("Telengana")
res49: String = Hyderabad
scala> capitals("Andhra")
res50: String = Amaravathi
capitals += ("Karnataka"->"Bangalore")
scala> capitals += ("Kerala"->"Trivandrum")
scala> capitals.foreach(println)
(Andhra,Amaravathi)
(Telengana,Hyderabad)
(Karnataka,Bangalore)
(Kerala,Trivandrum)
Transformation
a process applied to each and every individual element of a collection
Transformations :
map
flatMap
filter
i.e. some operation you want to perform on each and every element of a given collection
scala> var myList = List(10,20,30,40,50)
myList: List[Int] = List(10, 20, 30, 40, 50)
scala> var myResult = myList.map( x => x + 5) // for each value of x, I am going to add 5
myResult: List[Int] = List(15, 25, 35, 45, 55)
scala> var myUpdatedResult = myList.map (_ + 5) // short hand
myUpdatedResult: List[Int] = List(15, 25, 35, 45, 55)
map ( x => x + 5 ) =====> equal to ----> map ( _ + 5)
scala> myUpdatedResult.sum
res57: Int = 175
scala> var myBoolResult = x.map( x => x > 30)
myBoolResult: List[Boolean] = List(false, false, false, false, false, false, false, false, false, false)
scala> val z = myResult.map(_ > 30) // short hand
z: List[Boolean] = List(false, false, true, true, true)
InitCap Example:
---------------
scala> val names = List(" rAvI "," aNandhI ", " sUriyA "," BiNU")
names: List[String] = List(" rAvI ", " aNandhI ", " sUriyA ", " BiNU")
scala> val trimmedNames = names.map(x => x.trim)
trimmedNames: List[String] = List(rAvI, aNandhI, sUriyA, BiNU)
scala> val initCap = trimmedNames.map (x => x.substring(0,1).toUpperCase + x.substring(1,x.size).toLowerCase)
initCap: List[String] = List(Ravi, Anandhi, Suriya, Binu)
scala> initCap.foreach(println)
Ravi
Anandhi
Suriya
Binu
scala> val x = List(10,20,30,40,50)
x: List[Int] = List(10, 20, 30, 40, 50)
scala> val y = x.map (v => v+122)
y: List[Int] = List(132, 142, 152, 162, 172)
scala> val z = y.map(v => v > 150)
z: List[Boolean] = List(false, false, true, true, true)
Short hand formatted:
---------------------
scala> val x = List(10,20,30,40,50)
x: List[Int] = List(10, 20, 30, 40, 50)
scala> val y = x.map(_ + 122)
y: List[Int] = List(132, 142, 152, 162, 172)
scala> val z = y.map (_ > 150)
z: List[Boolean] = List(false, false, true, true, true)
scala> val str = "computer".toUpperCase
str: String = COMPUTER
scala> val names = List("RavI","raHUL","AnU","moHInI","BhaNU")
names: List[String] = List(RavI, raHUL, AnU, moHInI, BhaNU)
scala> val name2 = names.map(_.toUpperCase) // short hand
name2: List[String] = List(RAVI, RAHUL, ANU, MOHINI, BHANU)
scala> val name2 = names.map(x => x.toUpperCase)
name2: List[String] = List(RAVI, RAHUL, ANU, MOHINI, BHANU)
scala> name2.foreach(println)
RAVI
RAHUL
ANU
MOHINI
BHANU
val sals = List(10000,20000,3000,4000)
// netsal ------> tax 10%, HRA 20%
Calculation inside map:
scala> val myInput = List(1000,2000,2500,3000,4000,5000)
myInput: List[Int] = List(1000, 2000, 2500, 3000, 4000, 5000)
scala> val net = myInput.map { sal =>
| val tax = sal * 10/100
| val hra = sal * 20/100
| val net = sal + hra - tax
| net
| }
net: List[Int] = List(1100, 2200, 2750, 3300, 4400, 5500)
initCap:
--------
scala> val names = myInput.map { x =>
| val w = x.trim()
| val fc = w.substring(0,1)
| val fupp = fc.toUpperCase
| val rc = w.substring(1).toLowerCase
| fupp+rc
| }
names: List[String] = List(Rahul, Mayavi, Anandhi, Superstar)
scala> names.foreach(println)
Rahul
Mayavi
Anandhi
Superstar
scala> val my = List(10,20,30,40,50)
my: List[Int] = List(10, 20, 30, 40, 50)
scala> my.slice(2,5)
res3: List[Int] = List(30, 40, 50)
scala> my.slice(0,3)
res4: List[Int] = List(10, 20, 30)
// Take 1st 3 elements
//select * from my limit 3
scala> my.take(3)
res5: List[Int] = List(10, 20, 30)
// from 3 elements to remaining all
scala> my.slice(2,my.size)
res6: List[Int] = List(30, 40, 50)
extract characters from a given string:
--------------------------------------
scala> val str = "Computer"
str: String = Computer
scala> str.substring(4,7)
res7: String = ute
scala> str.substring(3,6)
res8: String = put
scala> str.slice(3,6)
res11: String = put
InitCap Using slice:
--------------------
scala> val names = List("RaHUL","MaYAvi","AnanThI","SUperStaR")
names: List[String] = List(RaHUL, MaYAvi, AnanThI, SUperStaR)
scala> val res = names.map{ x =>
| val w = x.trim()
| val fc = x.slice(0,1).toUpperCase
| val rc = w.slice(1,w.size).toLowerCase
| fc + rc
| }
res: List[String] = List(Rahul, Mayavi, Ananthi, Superstar)
I have a List which contains many lists inside:
// consider the inner lists as equivalent to 4 days' transactions
scala> val myL = List(List(0,1,2),List(3,4),List(5,6,7,8),List(9,10))
myL: List[List[Int]] = List(List(0, 1, 2), List(3, 4), List(5, 6, 7, 8), List(9, 10))
scala> val re1 = myL.map(x => x.size)
re1: List[Int] = List(3, 2, 4, 2)
scala> val re3 = myL.map (_.sum)
re3: List[Int] = List(3, 7, 26, 19)
scala> val re3 = myL.map (x => x.sum)
re3: List[Int] = List(3, 7, 26, 19)
map gives each individual day's size (count / length) and the sum of each inner list.
But I want to calculate the total sum of all the elements across every list.
Flat Map:
scala> val myL = List(List(0,1,2),List(3,4),List(5,6,7,8),List(9,10))
myL: List[List[Int]] = List(List(0, 1, 2), List(3, 4), List(5, 6, 7, 8), List(9, 10))
// flatMap merges all the inner lists and makes a single merged list.
// all the elements of the child lists are merged together into a single list
scala> val l2 = myL.flatMap(x => x)
l2: List[Int] = List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
//sum of the result of flat map (flattened result's sum)
scala> myL.flatMap(x=>x).sum
res18: Int = 55
Word Count:
----------
scala> val lines = List("I love hadoop","I love spark", "I love spark and hadoop", "spark is great")
lines: List[String] = List(I love hadoop, I love spark, I love spark and hadoop, spark is great)
Convert each word into key,value pairs
I,1
love,1
hadoop,1
I,1
love,1
spark,1
I,1
love,1
spark,1
and,1
hadoop,1
spark,1
is,1
great,1
scala> val w1 = lines.map(x => x.split(" "))
w1: List[Array[String]] = List(Array(I, love, hadoop), Array(I, love, spark), Array(I, love, spark, and, hadoop), Array(spark, is, great))
scala> val w2 = lines.map(_.split(" "))
w2: List[Array[String]] = List(Array(I, love, hadoop), Array(I, love, spark), Array(I, love, spark, and, hadoop), Array(spark, is, great))
//flatMap - Short hand
scala> val w3flattened = lines.flatMap(_.split(" "))
w3flattened: List[String] = List(I, love, hadoop, I, love, spark, I, love, spark, and, hadoop, spark, is, great)
//flatmap - regular way
scala> val w3flattened = lines.flatMap(x => x.split(" "))
w3flattened: List[String] = List(I, love, hadoop, I, love, spark, I, love, spark, and, hadoop, spark, is, great)
//make key,value pairs
scala> val pair = w3flattened.map((_,1))
pair: List[(String, Int)] = List((I,1), (love,1), (hadoop,1), (I,1), (love,1), (spark,1), (I,1), (love,1), (spark,1), (and,1), (hadoop,1), (spark,1), (is,1), (great,1))
scala> val pair = w3flattened.map(x => (x,1))
pair: List[(String, Int)] = List((I,1), (love,1), (hadoop,1), (I,1), (love,1), (spark,1), (I,1), (love,1), (spark,1), (and,1), (hadoop,1), (spark,1), (is,1), (great,1))
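The transcript stops at the (word,1) pairs; a minimal sketch of finishing the word count on this plain Scala list (in Spark the same step is usually a reduceByKey on a pair RDD):
// group the pairs by word and add up the 1s
val wordCounts = pair.groupBy(_._1).map { case (word, ps) => (word, ps.map(_._2).sum) }
wordCounts.foreach(println)   // e.g. (spark,3), (love,3), (hadoop,2), ...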
scala> var recs = List(
| "101,Amar,4000,m,11",
| "102,Nila,3000,f,12",
| "103,Kala,4500,f,11",
| "104,Anil,3000,m,13",
| "105,Arul,3200,m,15")
recs: List[String] = List(101,Amar,4000,m,11, 102,Nila,3000,f,12, 103,Kala,4500,f,11, 104,Anil,3000,m,13, 105,Arul,3200,m,15)
scala> recs.foreach(println)
101,Amar,4000,m,11
102,Nila,3000,f,12
103,Kala,4500,f,11
104,Anil,3000,m,13
105,Arul,3200,m,15
#Requirement:
//select sex,sum(sal) from emp group by sex;
we have to make the following:
// expectation : Array((m,4000),(f,3000),(f,4500),(m,3000),(m,3200))
scala> val arr = recs.map(x => x.split(","))
arr: List[Array[String]] = List(Array(101, Amar, 4000, m, 11), Array(102, Nila, 3000, f, 12), Array(103, Kala, 4500, f, 11), Array(104, Anil, 3000, m, 13), Array(105, Arul, 3200, m, 15))
// split will split based on comma as the delimiter and extract all the fields
// but we need to extract salary and sex fields only
scala> val salsexpair = arr.map(x => (x(3),x(2).toInt))
salsexpair: List[(String, Int)] = List((m,4000), (f,3000), (f,4500), (m,3000), (m,3200))
If you have sub collections, flatmap is needed
// ListOfList, ListOfArray
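data is not defined in this part of the transcript; judging from the sizes (3, 5, 2, 4) and the values in the output below, it was presumably something like:
val data = List("100,200,500",
                "800,200,300,400,500",
                "10000,3000",
                "900,1000,5000,10000")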
scala> val r = data.map { x =>
| val w = x.split(",")
| val cnt = w.size
| cnt
| }
r: List[Int] = List(3, 5, 2, 4)
map : performs the count (size / length) operation on each individual list
scala> val r = data.flatMap { x =>
| x.split(",")
| }
r: List[String] = List(100, 200, 500, 800, 200, 300, 400, 500, 10000, 3000, 900, 1000, 5000, 10000)
scala> r.size
res5: Int = 14
scala> r.foreach(println)
100
200
500
800
200
300
400
500
10000
3000
900
1000
5000
10000
scala> var i = 0
i: Int = 0
scala> for (i <- r) { println(i)}
100
200
500
800
200
300
400
500
10000
3000
900
1000
5000
10000
Filter operation:
------------------
Input:
scala> val objA = List(List(1,2,3),List(3,4),List(1,3,5,6),List(1,2,3))
objA: List[List[Int]] = List(List(1, 2, 3), List(3, 4), List(1, 3, 5, 6), List(1, 2, 3))
Filter who has more than 2 elements?
scala> val r = objA.filter (x => x.size > 2)
r: List[List[Int]] = List(List(1, 2, 3), List(1, 3, 5, 6), List(1, 2, 3))
Filter who has more than 3 elements?
scala> val r = objA.filter (x => x.size > 3)
r: List[List[Int]] = List(List(1, 3, 5, 6))
Here we are going to filter only the male / female records
scala> recs
res12: List[String] = List(101,Amar,4000,m,11, 102,Nila,3000,f,12, 103,Kala,4500,f,11, 104,Anil,3000,m,13, 105,Arul,3200,m,15)
scala> recs.foreach(println)
101,Amar,4000,m,11
102,Nila,3000,f,12
103,Kala,4500,f,11
104,Anil,3000,m,13
105,Arul,3200,m,15
Wrong approach (contains("m") matches an "m" anywhere in the whole record, not just in the sex field):
----------------
scala> val males = recs.filter(x => x.contains("m"))
males: List[String] = List(101,Amar,4000,m,11, 104,Anil,3000,m,13, 105,Arul,3200,m,15)
Correct approach:
------------------
scala> val males = recs.filter { x =>
| val sex = x.split(",")(3).toLowerCase
| sex == "m"
| }
males: List[String] = List(101,Amar,4000,m,11, 104,Anil,3000,m,13, 105,Arul,3200,m,15)
scala> males.foreach(println)
101,Amar,4000,m,11
104,Anil,3000,m,13
105,Arul,3200,m,15
Here we are filtering only female candidates:
---------------------------------------------
scala> val femals = recs.filter{ x =>
| val sex = x.split(",")(3).toLowerCase
| sex == "f"
| }
femals: List[String] = List(102,Nila,3000,f,12, 103,Kala,4500,f,11)
scala> femals.foreach(println)
102,Nila,3000,f,12
103,Kala,4500,f,11
val words = List(" I love India ", " I love Cricket ", " I love Big Data")
words: List[String] = List(" I love India ", " I love Cricket ", " I love Big Data")
scala> val w = words.map { x =>
| val xx = x.trim()
| val fc = xx.slice(0,1).toUpperCase
| val len = xx.length
| val rc = xx.slice(1,len)
| fc+rc
| }
w: List[String] = List(I love India, I love Cricket, I love Big Data)
scala> val words = List(" wHO is GoINg tO KiLL mE? "," Aruji is ARUmUGAm "," SheKARIndian ")
words: List[String] = List(" wHO is GoINg tO KiLL mE? ", " Aruji is ARUmUGAm ", " SheKARIndian ")
scala> val w = words.map { x =>
| val xx = x.trim()
| val fc = xx.slice(0,1).toUpperCase
| val rc = xx.slice(1,xx.length).toLowerCase
| fc+rc
| }
w: List[String] = List(Who is going to kill me?, Aruji is arumugam, Shekarindian)
scala> val x = List(10,20,30,40,50,60,70)
x: List[Int] = List(10, 20, 30, 40, 50, 60, 70)
scala> val y = x.map ( x => x <= 40)
y: List[Boolean] = List(true, true, true, true, false, false, false)
scala> val y = x.map ( x => x >= 40)
y: List[Boolean] = List(false, false, false, true, true, true, true)
scala> val y = x.map ( x => x != 40)
y: List[Boolean] = List(true, true, true, false, true, true, true)
scala> val y = x.map ( x => x == 40)
y: List[Boolean] = List(false, false, false, true, false, false, false)
scala> val lst = List(10,11,12,13,14,15)
lst: List[Int] = List(10, 11, 12, 13, 14, 15)
scala> lst.sum/lst.size
res0: Int = 12
scala> lst.sum/lst.length
res1: Int = 12
scala> val averageCheck = lst.map { x =>
| val sum = lst.sum
| val avg = sum / lst.length
| if (x >= avg) "Above" else "Below"
| }
averageCheck: List[String] = List(Below, Below, Above, Above, Above, Above)
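Note that the version above recomputes lst.sum and the average for every element of the list; a slightly cleaner sketch computes the average once outside the map:
val avg = lst.sum / lst.length           // 12
val averageCheck2 = lst.map(x => if (x >= avg) "Above" else "Below")
// List(Below, Below, Above, Above, Above, Above)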
scala> val a = 100
a: Int = 100
scala> val b = 200
b: Int = 200
scala> val big = if (a>b) a else b
big: Int = 200
scala> val small = if (a<b) a else b
small: Int = 100
// assuming a third value c (here c = 300) was defined earlier
scala> val biggest = if (a>b) { if (a>c) a else b} else {if (b>c) b else c}
biggest: Int = 300
Conditional Transformations:
---------------------------
scala> val dNos = List(11,12,13,11,11,12,13,12,12,13,14,15,11)
dNos: List[Int] = List(11, 12, 13, 11, 11, 12, 13, 12, 12, 13, 14, 15, 11)
val dNames = dNos.map { x =>
if (x ==11) "Marketing" else
if (x ==12) "HR" else
if (x ==13) "Finance" else
if (x ==14) "Sales" else
if (x ==15) "R&D"
}
dNames: List[Any] = List(Marketing, HR, Finance, Marketing, Marketing, HR, Finance, HR, HR, Finance, Sales, R&D, Marketing)
The result is List[Any] rather than List[String] because the final if has no else branch, so the expression's type widens to Any.
scala> val sex = "M"
sex: String = M
scala> val res = (sex.toUpperCase =="M") match {
| case true => "Male"
| case false => "Female"
| }
res: String = Male
scala> val sex = "f"
sex: String = f
scala> val res = (sex.toUpperCase =="M") match {
| case true => "Male"
| case false => "Female"
| }
res: String = Female
scala> val sex = "m"
sex: String = m
scala> val res = if (sex.toUpperCase =="M") "Male" else "Female"
res: String = Male
scala> val sex = "m"
sex: String = m
scala> val res = sex match {
| case "m" => "Male"
| case "f" => "Female"
| case other => "Unknown"
| }
res: String = Male
name --> initCap First Char into Upper case + remaining characters into lower case
---> generate tax, HRA
---> based on net ranges, classify them into Grades : A,B,C,D
---> sex
"m" / "M"
"f" / "F"
department numbers
11 -> Marketing, 12 -> HR, 13 -> Finance, 14 -> Others
Given input data:
scala> val recs = List ("101,Amar,40000,m,11",
| "102,aMaLa,80000,F,12",
| "103,MaNI,10000,m,13",
| "104,GiRI,45000,m,14",
| "105,SuREsh,60000,f,12",
| "106,SiRI,9000,M,15")
recs: List[String] = List(101,Amar,40000,m,11, 102,aMaLa,80000,F,12, 103,MaNI,10000,m,13, 104,GiRI,45000,m,14, 105,SuREsh,60000,f,12, 106,SiRI,9000,M,15)
scala> recs.foreach(println)
101,Amar,40000,m,11
102,aMaLa,80000,F,12
103,MaNI,10000,m,13
104,GiRI,45000,m,14
105,SuREsh,60000,f,12
106,SiRI,9000,M,15
scala> val trans = recs.map { x =>
val w = x.split(",")
val id = w(0)
val name = w(1).trim()
val sal = w(2).toInt
var sex = w(3)
val dno = w(4).toInt
val fc = name.slice(0,1).toUpperCase
val rc = name.slice(1,name.size).toLowerCase
val newName = fc+rc
sex = if (sex.toUpperCase =="M") "Male" else "Female"
val tax = sal * 10/100
val HRA = sal * 20 / 100
val net = sal + HRA - tax
var grade = " "
val dname = dno match {
case 11 => "Marketing"
case 12 => "HR"
case 13 => "Finance"
case other => "Others"
}
val newList = List(id,newName,sal.toString,HRA.toString,tax.toString,net.toString,grade,sex,dno.toString,dname)
newList
}
trans: List[List[String]] = List(List(101, Amar, 40000, 8000, 4000, 44000, " ", Male, 11, Marketing), List(102, Amala, 80000, 16000, 8000, 88000, " ", Female, 12, HR), List(103, Mani, 10000, 2000, 1000, 11000, " ", Male, 13, Finance), List(104, Giri, 45000, 9000, 4500, 49500, " ", Male, 14, Others), List(105, Suresh, 60000, 12000, 6000, 66000, " ", Female, 12, HR), List(106, Siri, 9000, 1800, 900, 9900, " ", Male, 15, Others))
Result:
scala> trans.foreach(println)
List(101, Amar, 40000, 8000, 4000, 44000, , Male, 11, Marketing)
List(102, Amala, 80000, 16000, 8000, 88000, , Female, 12, HR)
List(103, Mani, 10000, 2000, 1000, 11000, , Male, 13, Finance)
List(104, Giri, 45000, 9000, 4500, 49500, , Male, 14, Others)
List(105, Suresh, 60000, 12000, 6000, 66000, , Female, 12, HR)
List(106, Siri, 9000, 1800, 900, 9900, , Male, 15, Others)
scala> val emp = Array("101,aaaa,30000,m,11", "102,bbbb,50000,f,12","103,hhh,60000,m,11", "104,qqq,80000,f,11")
emp: Array[String] = Array(101,aaaa,30000,m,11, 102,bbbb,50000,f,12, 103,hhh,60000,m,11, 104,qqq,80000,f,11)
We are going to make a query like this :
//select sex,sum(sal) from emp group by sex
In order to perform this kind of operation, the input must be a pair RDD (key,value pairs).
A pair is a tuple which contains a key and a value.
Array((m,30000),(f,50000),(m,60000),(f,80000))
scala> emp.foreach(println)
101,aaaa,30000,m,11
102,bbbb,50000,f,12
103,hhh,60000,m,11
104,qqq,80000,f,11
scala> val pair1 = emp.map { x =>
| val w = x.split(",")
| val sex = w(3)
| val sal = w(2).toInt
| val t = (sex,sal)
| t
| }
pair1: Array[(String, Int)] = Array((m,30000), (f,50000), (m,60000), (f,80000))
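To actually answer the "sum(sal) group by sex" query, the pairs have to be aggregated by key; on a pair RDD that is typically done with reduceByKey. A minimal sketch (assuming sc is available, as in the earlier spark-shell sessions):
// parallelize the local pairs into a pair RDD and add up the salaries per sex
val pairRDD = sc.parallelize(pair1)
val salBySex = pairRDD.reduceByKey(_ + _)
salBySex.collect().foreach(println)   // e.g. (m,90000), (f,130000)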
// select dno,sum(sal) from emp group by dno
scala> val pair2 = emp.map { x =>
| val w = x.split(",")
| val dno = w(4)
| val sal = w(2).toInt
| (dno,sal)
| }
pair2: Array[(String, Int)] = Array((11,30000), (12,50000), (11,60000), (11,80000))
// select dno,sex,sum(sal) from emp group by dno,sex
// --> Array(((11,m),30000), (12,f),50000)...
scala> val pair3 = emp.map { x=>
| val w = x.split(",")
| val dno = w(4)
| val sex = w(3)
| val sal = w(2).toInt
| val myKey = (dno,sex)
| val t = (myKey,sal)
| t
| }
pair3: Array[((String, String), Int)] = Array(((11,m),30000), ((12,f),50000), ((11,m),60000), ((11,f),80000))
// Making records into a structure
2 ideas :
1) tuples, 2) case classes
Tuple : collection of heterogeneous elements
Convert the following records into tuples.
scala> emp.foreach(println)
101,aaaa,30000,m,11
102,bbbb,50000,f,12
103,hhh,60000,m,11
104,qqq,80000,f,11
scala> val recs = emp.map { x =>
| val w = x.split(",")
| val id = w(0).toInt
| val name = w(1)
| val sal = w(2).toInt
| val sex = w(3)
| val dno = w(4).toInt
| (id,name,sal,sex,dno)
| }
recs: Array[(Int, String, Int, String, Int)] = Array((101,aaaa,30000,m,11), (102,bbbb,50000,f,12), (103,hhh,60000,m,11), (104,qqq,80000,f,11))
// Here we transformed each record into a tuple
scala> recs.foreach(println)
(101,aaaa,30000,m,11)
(102,bbbb,50000,f,12)
(103,hhh,60000,m,11)
(104,qqq,80000,f,11)
Transform tuples into key,value pairs
the input is an array of tuples
scala> val pair4 = recs.map { x =>
| (x._4,x._3.toInt)
| }
pair4: Array[(String, Int)] = Array((m,30000), (f,50000), (m,60000), (f,80000))
Tuple to kvp
// select sum(sal) from recs
scala> recs.foreach(println)
(101,aaaa,30000,m,11)
(102,bbbb,50000,f,12)
(103,hhh,60000,m,11)
(104,qqq,80000,f,11)
// x._n refers to the position number of an element
scala> val sals = recs.map(x => x._3).sum
sals: Int = 220000
scala> val sals = recs.map(x => x._3)
sals: Array[Int] = Array(30000, 50000, 60000, 80000)
scala> sals.sum
res10: Int = 220000
// filter and transformation together
many elements are "" (empty strings from repeated blank spaces) - we are going to filter them out
scala> val textData = " I love Spark "
textData: String = " I love Spark "
scala> val text = textData.trim()
text: String = I love Spark
scala> val w = text.split(" ")
w: Array[String] = Array(I, "", "", "", "", "", love, "", "", "", "", Spark)
scala> val words = w.filter(x => x != "")
words: Array[String] = Array(I, love, Spark)
scala> val newLine = words.mkString(" ")
newLine: String = I love Spark
// While dealing with unstructured data like tweets and posts on Twitter or Facebook,
to do sentiment analysis we need to clean up unnecessary spaces.
scala> val comment = List(" I love Spark ", " You love Hadoop "," Hadoop and spark are great big data systems ")
comment: List[String] = List(" I love Spark ", " You love Hadoop ", " Hadoop and spark are great big data systems ")
scala> val newComment = comment.map { line =>
| val w = line.trim().split(" ")
| val words = w.filter(x => x != "")
| words.mkString(" ")
| }
newComment: List[String] = List(I love Spark, You love Hadoop, Hadoop and spark are great big data systems)
scala> newComment.foreach(println)
I love Spark
You love Hadoop
Hadoop and spark are great big data systems
Our records have just 5 fields, like
(101,aaaa,3000,m,11) ==> (id,name,salary,sex,dno)
But if we have 100s of fields, it's very difficult to identify the position or index of each element for processing.
We are going to use case classes to identify field names directly instead of index or position numbers.
A tuple doesn't have a schema.
select name,city from emp ---> an SQL table has a schema, so we directly access fields in queries.
scala> case class Samp(a:Int, b:Int, c:Int)
defined class Samp
scala> val s1 = Samp(10,20,30)
s1: Samp = Samp(10,20,30)
scala> s1.a+s1.b+s1.b
res1: Int = 50
scala> val s1 = Samp(10,20,30)
s1: Samp = Samp(10,20,30)
scala> val myList = List(10,20,30)
myList: List[Int] = List(10, 20, 30)
// Using Index number we can fetch
scala> myList(1)
res56: Int = 20
scala> val myTuple = (10,20,30)
myTuple: (Int, Int, Int) = (10,20,30)
// Using position number we can fetch
scala> myTuple._2
res58: Int = 20
List is for homogeneous collection
Access List item using index
Tuple is for hetrogeneous collection
Access Tuple Item using position
scala> val a = 100
a: Int = 100
// use a semicolon (;) to separate multiple statements on one line
scala> val b = a*2; val c = b*2
b: Int = 200
c: Int = 400
// Block grouping --
scala> val r = {
| val x = 10;
| val y = x+10;
| val z = x-20;
| (x,y,z)
| }
r: (Int, Int, Int) = (10,20,-10)
function to add 2 numbers:
scala> def add(x:Int, y:Int) = x + y
add: (x: Int, y: Int)Int
My intention is that the result should come out like this : (firstNumber,secondNumber,AddedResult)
scala> val r = {
| val result = (10,20,add(10,20))
| result
| }
r: (Int, Int, Int) = (10,20,30)
Unit == () == void == no return values
scala> val dayNo = 3
dayNo: Int = 3
scala> val dayName = dayNo match {
| case 1 => "Monday"
| case 2 => "TuesDay"
| case 3 => "Wed"
| case 4 => "Thursday"
| case 5 => "Friday"
| case 6 => "Saturday"
| case 7 => "Sunday"
| }
dayName: String = Wed
scala> val dNo = 4
dNo: Int = 4
scala> if (dNo < 6) "Working Day" else "Holiday"
res60: String = Working Day
scala> val dNo = 7
dNo: Int = 7
scala> if (dNo<6) "Working Day" else "Holiday"
res63: String = Holiday
scala> val s2 = Samp(1,2,3)
s2: Samp = Samp(1,2,3)
scala> val s3 = Samp(100,200,300)
s3: Samp = Samp(100,200,300)
scala> s1.b
res2: Int = 20
scala> s1.a
res3: Int = 10
scala> s1.c
res4: Int = 30
scala> s2.a
res5: Int = 1
scala> s3.b
res6: Int = 200
scala> val s = List(s1,s2,s3)
s: List[Samp] = List(Samp(10,20,30), Samp(1,2,3), Samp(100,200,300))
scala> val s = List(s1,s2,s3)
s: List[Samp] = List(Samp(10,20,30), Samp(1,2,3), Samp(100,200,300))
scala> s.foreach(println)
Samp(10,20,30)
Samp(1,2,3)
Samp(100,200,300)
// find the sum of individual Samp inside the main List
scala> val r = s.map(x => x.a + x.b + x.c)
r: List[Int] = List(60, 6, 600)
scala> val emp = Array("101,aaaa,30000,m,11", "102,bbbb,50000,f,12","103,hhh,60000,m,11", "104,qqq,8000,f,11")
emp: Array[String] = Array(101,aaaa,30000,m,11, 102,bbbb,50000,f,12, 103,hhh,60000,m,11, 104,qqq,8000,f,11)
scala> emp.foreach(println)
101,aaaa,30000,m,11
102,bbbb,50000,f,12
103,hhh,60000,m,11
104,qqq,8000,f,11
scala> case class Emp(id:Int, name:String,sal:Int,sex:String,dname:String)
defined class Emp
scala> val e = emp.map { x =>
| val w = x.split(",")
| val id = w(0).toInt
| val name = w(1)
| val sal = w(2).toInt
| val sex = w(3)
| val dno = w(4).toInt
| val dname = dno match {
| case 11 => "Marketing"
| case 12 => "HR"
| case 13 => "Finance"
| case other => "Others"
| }
| val rec = Emp(id,name,sal,sex,dname)
| rec
| }
e: Array[Emp] = Array(Emp(101,aaaa,30000,m,Marketing), Emp(102,bbbb,50000,f,HR), Emp(103,hhh,60000,m,Marketing), Emp(104,qqq,8000,f,Marketing))
We split each record in the Array into its individual fields and passed them
into the Emp case class.
scala> e.foreach(println)
Emp(101,aaaa,30000,m,Marketing)
Emp(102,bbbb,50000,f,HR)
Emp(103,hhh,60000,m,Marketing)
Emp(104,qqq,8000,f,Marketing)
// here we directly use the field names instead of position or index
scala> val pair5 = e.map(x => (x.dname,x.sal))
pair5: Array[(String, Int)] = Array((Marketing,30000), (HR,50000), (Marketing,60000), (Marketing,8000))
scala> pair5.foreach(println)
(Marketing,30000)
(HR,50000)
(Marketing,60000)
(Marketing,8000)
// case class
Advantage #1: you can give the data a schema and access fields by name instead of by position or index number.
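To see the advantage concretely, here is a small contrast sketch (the tuple record below is hypothetical, not from the session):
// with a plain tuple you must remember that _5 is dname and _3 is sal
val t = (101, "aaaa", 30000, "m", "Marketing")
val pairT = (t._5, t._3)                  // positional access, easy to get wrong
// with the case class the field names carry the schema
val pairE = e.map(x => (x.dname, x.sal))  // readable, self-documenting access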
How to develop functions in Scala?
scala> def f : String = "Hello"
f: String
// Return type is String
// No input parameters
// Defined without (), so it is also called without ()
scala> f
res4: String = Hello
scala> f.toString
res5: String = Hello
//Automatic return type as string
scala> def f = "Hello"
f: String
scala> def f = {
| val x = "heLLo"
| val y = x.toUpperCase
| y
| }
f: String
scala> f
res6: String = HELLO
scala> def add100(a:Int) : Int = a+100
add100: (a: Int)Int
scala> add100(1)
res7: Int = 101
scala> def add100(a:Int) = a+100
add100: (a: Int)Int
scala> add100(1)
res8: Int = 101
scala> def UpperConvert(a:String) = {
| val my = a.trim
| val word = my.split(" ")
| val words = word.filter(x => x != "")
| val output = words.mkString(" ")
| output.toUpperCase
| }
UpperConvert: (a: String)String
scala> UpperConvert(myText)
res1: String = HAI HELLO HOW ARE YOU
scala> val emp = Array("101,aaaa,30000,m,11", "102,bbbb,50000,f,12","103,hhh,60000,m,11", "104,qqq,8000,f,11")
emp: Array[String] = Array(101,aaaa,30000,m,11, 102,bbbb,50000,f,12, 103,hhh,60000,m,11, 104,qqq,8000,f,11)
scala> emp.foreach(println)
101,aaaa,30000,m,11
102,bbbb,50000,f,12
103,hhh,60000,m,11
104,qqq,8000,f,11
scala> def InitCap(x:String) = {
| val w = x.trim()
| val fc = w.slice(0,1).toUpperCase
| val rc = w.slice(1,w.size).toLowerCase
| val name = fc+rc
| name
| }
InitCap: (x: String)String
scala> InitCap(" i LoVE iNdIA ")
res5: String = I love india
scala> def fullGender(x:String) = {
| if (x.toUpperCase=="M") "Male" else "Female"
| }
fullGender: (x: String)String
scala> fullGender("m")
res6: String = Male
scala> fullGender("F")
res7: String = Female
scala> def grade(x:Int) = {
| if (x >70000) "A" else if (x >= 50000) "B" else if (x >= 30000) "C" else "D"
| }
grade: (x: Int)String
scala> grade(3000000)
res8: String = A
scala> grade(50001)
res9: String = B
scala> grade(30000)
res10: String = C
scala> grade(32)
res11: String = D
scala> def dept(dno:Int) = {
var dname = dno match {
case 11 => "Marketing"
case 12 => "HR"
case 13 => "Finance"
case other => "Others"
}
dname
}
scala> dept(11)
res12: String = Marketing
scala> dept(12)
res13: String = HR
scala> dept(13)
res14: String = Finance
scala> dept(14)
res15: String = Others
scala> case class Empl(id:Int, name:String, sal:Int, sex:String, dno:Int,dname:String,grade:String)
defined class Empl
def toEmp(line:String): Empl = {
val w = line.split(",")
val id = w(0).toInt
val name = InitCap(w(1))
val sal = w(2).toInt
val sex = fullGender(w(3))
val dno = w(4).toInt
val dname = dept(dno)
val grd = grade(sal)
val e = Empl(id,name,sal,sex,dno,dname,grd)
e
}
scala> toEmp("201,Amar,8000,m,11")
res16: Empl = Empl(201,Amar,8000,Male,11,Marketing,D)
scala> toEmp("201,Amar,80002,m,11")
res17: Empl = Empl(201,Amar,80002,Male,11,Marketing,A)
scala> emp.foreach(println)
101,aaaa,30000,m,11
102,bbbb,50000,f,12
103,hhh,60000,m,11
104,qqq,8000,f,11
scala> val emps = emp.map { x =>
| toEmp(x)
| }
emps: Array[Empl] = Array(Empl(101,Aaaa,30000,Male,11,Marketing,C), Empl(102,Bbbb,50000,Female,12,HR,B), Empl(103,Hhh,60000,Male,11,Marketing,B), Empl(104,Qqq,8000,Female,11,Marketing,D))
scala> val emps = emp.map (x => toEmp(x))
emps: Array[Empl] = Array(Empl(101,Aaaa,30000,Male,11,Marketing,C), Empl(102,Bbbb,50000,Female,12,HR,B), Empl(103,Hhh,60000,Male,11,Marketing,B), Empl(104,Qqq,8000,Female,11,Marketing,D))
scala> emps.foreach(println)
Empl(101,Aaaa,30000,Male,11,Marketing,C)
Empl(102,Bbbb,50000,Female,12,HR,B)
Empl(103,Hhh,60000,Male,11,Marketing,B)
Empl(104,Qqq,8000,Female,11,Marketing,D)
scala> def isMale(x:String) = {
| x.toUpperCase == "M"
| }
isMale: (x: String)Boolean
scala> isMale("m")
res20: Boolean = true
scala> isMale("f")
res21: Boolean = false
// (wrong attempt: x != isMale(x) compares a String with a Boolean, so it is always true and nothing gets filtered out)
scala> val males = lst.filter(x => x != isMale(x))
males: List[String] = List(m, M, f, F, M, f, m, M)
scala> lst.filter(x => isMale(x))
res24: List[String] = List(m, M, M, m, M)
scala> val males = lst.filter(x => isMale(x))
males: List[String] = List(m, M, M, m, M)
scala> val females = lst.filter(x => !isMale(x))
females: List[String] = List(f, F, f)
scala> emp.foreach(println)
101,aaaa,30000,m,11
102,bbbb,50000,f,12
103,hhh,60000,m,11
104,qqq,8000,f,11
WHERE sex = "Male"
scala> val ms = emp.filter(x => isMale(x.split(",")(3)))
ms: Array[String] = Array(101,aaaa,30000,m,11, 103,hhh,60000,m,11)
scala> val fms = emp.filter(x => !isMale(x.split(",")(3)))
fms: Array[String] = Array(102,bbbb,50000,f,12, 104,qqq,8000,f,11)
scala> val res = emps.filter(x => isMale(x.sex.slice(0,1)))
res: Array[Empl] = Array(Empl(101,Aaaa,30000,Male,11,Marketing,C), Empl(103,Hhh,60000,Male,11,Marketing,B))
scala> res.foreach(println)
Empl(101,Aaaa,30000,Male,11,Marketing,C)
Empl(103,Hhh,60000,Male,11,Marketing,B)
WHERE sex = "Female"
scala> val res = emps.filter(x => !isMale(x.sex.slice(0,1)))
res: Array[Empl] = Array(Empl(102,Bbbb,50000,Female,12,HR,B), Empl(104,Qqq,8000,Female,11,Marketing,D))
scala> res.foreach(println)
Empl(102,Bbbb,50000,Female,12,HR,B)
Empl(104,Qqq,8000,Female,11,Marketing,D)
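Filter and map can be combined for a SQL-like SELECT ... WHERE; a small sketch using the emps array above:
// WHERE sex = "Female"  --> filter;  SELECT name  --> map
val femaleNames = emps.filter(x => x.sex == "Female").map(x => x.name)
// expected: Array(Bbbb, Qqq)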
Recursive functions:
----------------------
scala> def power(x:Int, n:Int):Int = {
| if (n >= 1) x * power(x,n-1)
| else 1
| }
power: (x: Int, n: Int)Int
scala> power(1)
<console>:28: error: not enough arguments for method power: (x: Int, n: Int)Int.
Unspecified value parameter n.
power(1)
^
scala> power(3,2)
res32: Int = 9
scala> power(1,0)
res33: Int = 1
scala> power(0,0)
res34: Int = 1
scala> power(5,5)
res35: Int = 3125
5! = 5 * 4 * 3 * 2 * 1
scala> def fact(x:Int) : Int = {
if (x > 1) x * fact (x - 1)
else 1
}
fact: (x: Int)Int
scala> fact(5)
res36: Int = 120
scala> fact(0)
res37: Int = 1
scala> fact (10)
res38: Int = 3628800
scala> def fact(x:Long) : Long = {
| if (x > 1) x * fact (x - 1)
| else 1
| }
fact: (x: Long)Long
scala> fact(25)
res41: Long = 7034535277573963776
// Note: 25! is far larger than Long.MaxValue, so this Long result has silently overflowed
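If exact values beyond the Long range are needed, BigInt avoids the overflow; a minimal sketch (factBig is a hypothetical name, not from the session):
def factBig(x: Int): BigInt = {
  if (x > 1) BigInt(x) * factBig(x - 1)   // BigInt arithmetic never overflows
  else BigInt(1)
}
factBig(5)    // 120
factBig(25)   // 15511210043330985984000000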
scala> val c : Char = '*'
c: Char = *
scala> val str : String = "Hello World!"
str: String = Hello World!
scala> val myStr : String = "10"
myStr: String = 10
scala> val myInt : Integer = myStr.toInt
myInt: Integer = 10
scala> val x : String = "10.34"
x: String = 10.34
scala> val y : Integer = x.toDouble.toInt
y: Integer = 10
Int (lower rank)
Double (higher rank)
Low rank to high rank -- widening -- is allowed
-----------------------------------------------
(Int to Double is allowed implicitly)
scala> val y : Double = 100
y: Double = 100.0
High rank to low rank -- narrowing -- is not allowed implicitly
----------------------------------------------------------------
(Double to Int needs an explicit conversion)
scala> val x : Int = 100.3
<console>:25: error: type mismatch;
found : Double(100.3)
required: Int
val x : Int = 100.3
^
Type Casting:
-------------
scala> val x : Integer = 100.34.toInt
x: Integer = 100
scala> val y : Integer = "100.234".toDouble.toInt
y: Integer = 100
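Calling toInt directly on "100.234" would throw a NumberFormatException; wrapping the conversion in scala.util.Try is one way to guard such parsing (a sketch, not used elsewhere in these notes):
import scala.util.Try
Try("100".toInt)                    // Success(100)
Try("100.234".toInt)                // Failure(java.lang.NumberFormatException: ...)
Try("100.234".toInt).getOrElse(0)   // Int = 0 -- fall back to a default value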
Reassignment issues:
---------------------
scala> var x = 10
x: Int = 10
scala> x = "234"
<console>:27: error: type mismatch;
found : String("234")
required: Int
x = "234"
^
scala> x = 23.34
<console>:27: error: type mismatch;
found : Double(23.34)
required: Int
x = 23.34
^
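The var's type is fixed at its declaration. A sketch, assuming one variable really must hold values of different types: declaring it as Any works, though it gives up compile-time type checking:
var anything: Any = 10   // static type is Any, so any value can be assigned later
anything = "234"         // ok
anything = 23.34         // ok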
Logical Operators:
-------------------
Short-circuit (evaluation stops as soon as the result is known)
&& ||
Strict (both operands are always evaluated)
& |
(! is logical NOT)
scala> val sal = 2000
sal: Int = 2000
scala> sal > 1000 & sal < 3000
res46: Boolean = true
scala> sal > 234234
res47: Boolean = false
val dno = 13
SHORT-CIRCUIT: || stops as soon as one condition is true; here it evaluates dno == 12 (false) and dno == 13 (true), then returns true without checking the rest
scala> dno == 12 || dno == 13 || dno == 14 || dno == 15 || dno == 16
res48: Boolean = true
STRICT: | evaluates every condition even though the result is already known after the second one, and then returns true
scala> dno == 12 | dno == 13 | dno == 14 | dno == 15 | dno == 16
res48: Boolean = true
scala> val sal = 40000
sal: Int = 40000
scala> val city = "hyd"
city: String = hyd
scala> val sex = "m"
sex: String = m
SHORT-CIRCUIT: && stops at the first false condition, so sex == "m" is never evaluated here
scala> city =="hyd" && sal >= 50000 && sex =="m"
res50: Boolean = false
STRICT: & evaluates all three conditions even though the result is already false after the second one
scala> city == "hyd" & sal >= 50000 & sex =="m"
res51: Boolean = false
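The difference is easiest to see with a predicate that prints when it runs; a minimal sketch (check is a hypothetical helper, not part of the session):
def check(label: String, b: Boolean): Boolean = { println("evaluating " + label); b }
check("A", false) && check("B", true)   // prints only "evaluating A" -- && stopped once the result was known
check("A", false) & check("B", true)    // prints "evaluating A" and "evaluating B" -- & always evaluates both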
scala> dNo < 6 match {
| case true => "Working Day"
| case false => "Holiday"
| }
res64: String = Holiday
scala> val d = "sat"
d: String = sat
scala> val dd = d match {
| case "mon" | "tue" | "wed" | "thu" | "fri" => "Working Day"
| case other => "Holiday"
| }
dd: String = Holiday
Looping:
for (x <- 1 to 6) { println(x) }
1
2
3
4
5
6
scala> val name = List("Ravi","Rani","Vani","Veni","Varun")
name: List[String] = List(Ravi, Rani, Vani, Veni, Varun)
scala> for (x <- name)
| println(x)
Ravi
Rani
Vani
Veni
Varun
scala> name.foreach(println)
Ravi
Rani
Vani
Veni
Varun
scala> for (x <- name)
| {
| val y = x.toUpperCase
| println(y)
| }
RAVI
RANI
VANI
VENI
VARUN
name.map (x => x.toUpperCase).foreach(println)
RAVI
RANI
VANI
VENI
VARUN
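A for expression with yield (not used above) builds a new collection, which is the loop-style equivalent of the map call; a small sketch using the same name list:
val upper = for (x <- name) yield x.toUpperCase
// upper: List[String] = List(RAVI, RANI, VANI, VENI, VARUN)
upper.foreach(println)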
String formatting:
------------------
scala> for (x <- 1 to 7)
| println(s"Day $x")
Day 1
Day 2
Day 3
Day 4
Day 5
Day 6
Day 7
scala> val days = 1 to 7
days: scala.collection.immutable.Range.Inclusive = Range(1, 2, 3, 4, 5, 6, 7)
scala> for (x <- days)
| {
| println("Day $x")
| }
Day $x
Day $x
Day $x
Day $x
Day $x
Day $x
Day $x
scala> for (x <- days)
| println(s"Day $x") // prefix s means special string formatting
Day 1
Day 2
Day 3
Day 4
Day 5
Day 6
Day 7
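The s interpolator can also embed a full expression inside ${ }; a small sketch:
for (x <- days)
  println(s"Day $x of ${days.size}")   // ${ } evaluates an arbitrary expression
// Day 1 of 7 ... Day 7 of 7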
scala> val l = List(1,2,3,4,5,6,7,8,9,10)
l: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
scala> l.sum
res77: Int = 55
scala> l.sum / l.size
res78: Int = 5
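Because both sum and size are Int, 55 / 10 is integer division and truncates to 5; converting one side to Double gives the real average (a quick sketch):
l.sum.toDouble / l.size   // Double = 5.5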
Reduce:
----------
Cumulatively combines the elements of a collection into a single value:
scala> l.reduce((x,y) => x+y)
res79: Int = 55
scala> l.reduce( (x,y) => Math.max(x,y))
res80: Int = 10
scala> l.reduce( (x,y) => Math.min(x,y))
res83: Int = 1
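The same reductions are often written with the underscore placeholder shorthand; an equivalent sketch:
l.reduce(_ + _)     // Int = 55  -- shorthand for (x, y) => x + y
l.reduce(_ max _)   // Int = 10  -- Int values have max/min methods
l.reduce(_ min _)   // Int = 1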