Sankara's Big Data Notes: pandas

#Series example: when no index is given, the default indexes are 0, 1, 2, 3, ...

import pandas as pd

s1 = pd.Series([1,2,3,4,5,6,7,8,9,10])

print(s1)

0 1

1 2

2 3

3 4

4 5

5 6

6 7

7 8

8 9

9 10

dtype: int64

type(s1)

pandas.core.series.Series

#Series example with Custom indexes

import pandas as pd

s1 = pd.Series([86,63,85,81,90],index=["Tamil","English","Maths","Science","Social"])

print(s1)

Tamil 86

English 63

Maths 85

Science 81

Social 90

dtype: int64

#Passing dictionary object to the Series

#Keys of a given Dictionary will become Indexes

import pandas as pd

subjectDict = {"Tamil":85, "English":63, "Maths":85, "Science":81, "Social":90}

s1 = pd.Series(subjectDict)

print(s1)

Tamil 85

English 63

Maths 85

Science 81

Social 90

dtype: int64

# 'b' and 'd' are not keys of the given dictionary, so NaN is assigned to them; 'e' is not in the index list, so it is left out

import pandas as pd

s1 = pd.Series({"a":10,"c":30,"e":40},index=["b","c","d","a"])

print(s1)

b NaN

c 30.0

d NaN

a 10.0

dtype: float64

s1 = pd.Series([5,7,3,2,88,22,-1,0,33])

print(s1[3])

print(s1[:2])

0 5

1 7

dtype: int64

print(s1[-1:])

8 33

dtype: int64

print(s1[:6])

0 5

1 7

2 3

3 2

4 88

5 22

dtype: int64

#Arithmetic operations

s1 = pd.Series([10,20,30,40])

s2 = pd.Series([11,22,33,44])

s3 = s1 + s2

print(s3)

0 21

1 42

2 63

3 84

dtype: int64

s1 = pd.Series([11,66,77,55])

s2 = pd.Series([5,22,22,44])

s3 = s1 - s2

print(s3)

0 6

1 44

2 55

3 11

dtype: int64

print(s1+15)

0 26

1 81

2 92

3 70

dtype: int64

print(s2 ** 1.3)

0 8.103283

1 55.609563

2 55.609563

3 136.926807

dtype: float64

s1 = pd.Series([1,2,3])

s2 = pd.Series([6,7,8])

print(s1,s2)

0 1

1 2

2 3

dtype: int64 0 6

1 7

2 8

dtype: int64

print(s1+s2)

0 7

1 9

2 11

dtype: int64

print(s1*s2)

0 6

1 14

2 24

dtype: int64

print(s1-s2, s2-s1)

0 -5

1 -5

2 -5

dtype: int64 0 5

1 5

2 5

dtype: int64

#DataFrame Example

import pandas as pd

subjectDict = {"Subjects":["Tamil","English","Maths","Science","Social"],"Marks":[86,63,85,81,90]}

df = pd.DataFrame(subjectDict)

print(df)

Subjects Marks

0 Tamil 86

1 English 63

2 Maths 85

3 Science 81

4 Social 90

import pandas as pd

subjectDict = {"Names":["Arjun","Ram","Biswa","Kalai","Nila"],"Age":[78,37,88,43,93]}

df = pd.DataFrame(subjectDict)

print(df)

Names Age

0 Arjun 78

1 Ram 37

2 Biswa 88

3 Kalai 43

4 Nila 93

df = pd.read_csv("https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv")

df.head()

df = pd.read_csv("E:\\PyExa\\iris.csv")

df.head()

sepal.length sepal.width petal.length petal.width variety

0 5.1 3.5 1.4 0.2 Setosa

1 4.9 3.0 1.4 0.2 Setosa

2 4.7 3.2 1.3 0.2 Setosa

3 4.6 3.1 1.5 0.2 Setosa

4 5.0 3.6 1.4 0.2 Setosa

df.tail()

sepal.length sepal.width petal.length petal.width variety

145 6.7 3.0 5.2 2.3 Virginica

146 6.3 2.5 5.0 1.9 Virginica

147 6.5 3.0 5.2 2.0 Virginica

148 6.2 3.4 5.4 2.3 Virginica

149 5.9 3.0 5.1 1.8 Virginica

print(df.shape)

(150, 5) # 150 X 5 ==> 150 Rows X 5 columns

df.describe()

sepal.length sepal.width petal.length petal.width

count 150.000000 150.000000 150.000000 150.000000

mean 5.843333 3.057333 3.758000 1.199333

std 0.828066 0.435866 1.765298 0.762238

min 4.300000 2.000000 1.000000 0.100000

25% 5.100000 2.800000 1.600000 0.300000

50% 5.800000 3.000000 4.350000 1.300000

75% 6.400000 3.300000 5.100000 1.800000

max 7.900000 4.400000 6.900000 2.500000

df.iloc[0:3,0:2]

#1st 3 Rows and 1st 2 Columns

sepal.length sepal.width

0 5.1 3.5

1 4.9 3.0

2 4.7 3.2

df.iloc[0:4,0:4]

#1st 4 Rows and 1st 4 Columns

sepal.length sepal.width petal.length petal.width

0 5.1 3.5 1.4 0.2

1 4.9 3.0 1.4 0.2

2 4.7 3.2 1.3 0.2

3 4.6 3.1 1.5 0.2

df.loc[0:7,("petal.length","petal.width","variety")]

#1st 8 rows (labels 0 through 7, because .loc slicing includes the end label) and specified columns

petal.length petal.width variety

0 1.4 0.2 Setosa

1 1.4 0.2 Setosa

2 1.3 0.2 Setosa

3 1.5 0.2 Setosa

4 1.4 0.2 Setosa

5 1.7 0.4 Setosa

6 1.4 0.3 Setosa

7 1.5 0.2 Setosa

#Drop variety column in the dataframe

s1 = df.drop("variety",axis=1)

print (s1.head())

sepal.length sepal.width petal.length petal.width

0 5.1 3.5 1.4 0.2

1 4.9 3.0 1.4 0.2

2 4.7 3.2 1.3 0.2

3 4.6 3.1 1.5 0.2

4 5.0 3.6 1.4 0.2

#Drop 3 rows

s1 = df.drop([1,2,3],axis=0)

print(s1.head())

sepal.length sepal.width petal.length petal.width variety

0 5.1 3.5 1.4 0.2 Setosa

4 5.0 3.6 1.4 0.2 Setosa

5 5.4 3.9 1.7 0.4 Setosa

6 4.6 3.4 1.4 0.3 Setosa

7 5.0 3.4 1.5 0.2 Setosa

# numeric_only=True restricts the mean to the numeric columns; without it,
# pandas 2.0+ raises TypeError on the string column "variety" instead of
# silently skipping it.
df.mean(numeric_only=True)

sepal.length 5.843333

sepal.width 3.057333

petal.length 3.758000

petal.width 1.199333

dtype: float64

# numeric_only=True restricts the median to the numeric columns; without it,
# pandas 2.0+ raises TypeError on the string column "variety" instead of
# silently skipping it.
df.median(numeric_only=True)

sepal.length 5.80

sepal.width 3.00

petal.length 4.35

petal.width 1.30

dtype: float64

df.min()

sepal.length 4.3

sepal.width 2

petal.length 1

petal.width 0.1

variety Setosa

dtype: object

df.max()

sepal.length 7.9

sepal.width 4.4

petal.length 6.9

petal.width 2.5

variety Virginica

dtype: object

#applying user defined function

def half(s):
    """Return ``s`` multiplied by 0.5.

    Intended as a user-defined function for ``DataFrame.apply``: whatever
    object is passed in as ``s`` is scaled with the ``*`` operator, so any
    operand that supports multiplication by a float works.
    """
    return s*0.5

s1 = df[["sepal.length","petal.length"]].apply(half) #half is the udf

print(df[["sepal.length","petal.length"]].head())

print(s1.head())

sepal.length petal.length

0 5.1 1.4

1 4.9 1.4

2 4.7 1.3

3 4.6 1.5

4 5.0 1.4

sepal.length petal.length

0 2.55 0.70

1 2.45 0.70

2 2.35 0.65

3 2.30 0.75

4 2.50 0.70

#user defined function to double the dataframe values

def double_make(s):
    """Return ``s`` multiplied by 2.

    Intended as a user-defined function for ``DataFrame.apply``: whatever
    object is passed in as ``s`` is doubled with the ``*`` operator.
    """
    return s*2

print(df[["sepal.width","petal.width"]].head(5))

s1 = df[["sepal.width","petal.width"]].apply(double_make)

print(s1.head())

sepal.width petal.width

0 3.5 0.2

1 3.0 0.2

2 3.2 0.2

3 3.1 0.2

4 3.6 0.2

sepal.width petal.width

0 7.0 0.4

1 6.0 0.4

2 6.4 0.4

3 6.2 0.4

4 7.2 0.4

#Counting how many times each distinct value occurs in a particular column

s1 = df["variety"].value_counts()

print(s1)

Virginica 50

Setosa 50

Versicolor 50

Name: variety, dtype: int64

#Sort order

df.sort_values(by="sepal.length").head()

sepal.length sepal.width petal.length petal.width variety

13 4.3 3.0 1.1 0.1 Setosa

42 4.4 3.2 1.3 0.2 Setosa

38 4.4 3.0 1.3 0.2 Setosa

8 4.4 2.9 1.4 0.2 Setosa

41 4.5 2.3 1.3 0.3 Setosa

Sankara's Big Data Notes

Friday, 15 May 2020

Introduction to Pandas - Sample Programs

Flume - Simple Demo