Sankara's Big Data Notes: External Table creation in Hive and loading 4 different data files located in HDFS

Tuesday, 4 August 2020

External Table creation in Hive and loading 4 different data files located in HDFS

$ pwd

/home/cloudera/subjects

cat > list1.txt

id,subject_name,students_attended

100,Maths,10

101,Science,20

cat > list2.txt

id,subject_name,students_attended

103,Maths,5

104,Science,5

cat > list3.txt

id,subject_name,students_attended

106,Maths,25

107,Science,25

cat > list4.txt

id,subject_name,students_attended

109,Maths,11

110,Science,21

$ ls -lrt

total 16

-rw-rw-r-- 1 cloudera cloudera 79 Aug 4 05:24 list1.txt

-rw-rw-r-- 1 cloudera cloudera 77 Aug 4 05:24 list2.txt

-rw-rw-r-- 1 cloudera cloudera 79 Aug 4 05:25 list3.txt

-rw-rw-r-- 1 cloudera cloudera 79 Aug 4 05:25 list4.txt

$ cat *

id,subject_name,students_attended

100,Maths,10

101,Science,20

id,subject_name,students_attended

103,Maths,5

104,Science,5

id,subject_name,students_attended

106,Maths,25

107,Science,25

id,subject_name,students_attended

109,Maths,11

110,Science,21

$ hdfs dfs -mkdir /user/cloudera/subjects-data

$ hdfs dfs -copyFromLocal *.* /user/cloudera/subjects-data/

$ hdfs dfs -ls /user/cloudera/subjects-data/

Found 4 items

-rw-r--r-- 1 cloudera supergroup 79 2020-08-04 05:41 /user/cloudera/subjects-data/list1.txt

-rw-r--r-- 1 cloudera supergroup 77 2020-08-04 05:41 /user/cloudera/subjects-data/list2.txt

-rw-r--r-- 1 cloudera supergroup 79 2020-08-04 05:41 /user/cloudera/subjects-data/list3.txt

-rw-r--r-- 1 cloudera supergroup 79 2020-08-04 05:41 /user/cloudera/subjects-data/list4.txt

create external table ohm.subject

(

id int,

subject_name string,

students_attended int

)

row format delimited

fields terminated by ",";

LOCATION

hive> LOAD DATA LOCAL INPATH '/home/cloudera/subjects/' INTO TABLE ohm.subject;

Loading data to table ohm.subject

Table ohm.subject stats: [numFiles=4, totalSize=314]

Time taken: 0.97 second

// Here it shows headers in between because each file has its own header

hive> select * from ohm.subject;

NULL subject_name NULL

100 Maths 10

101 Science 20

NULL subject_name NULL

103 Maths 5

104 Science 5

NULL subject_name NULL

106 Maths 25

107 Science 25

NULL subject_name NULL

109 Maths 11

110 Science 21

Time taken: 0.511 seconds, Fetched: 16 row(s)

// droping the table and going to re-create it

hive> drop table ohm.subject;

Time taken: 0.189 seconds

// skipping the header lines

// LOCATION specified in hdfs path

create external table ohm.subject

(

id int,

subject_name string,

students_attended int

)

row format delimited

fields terminated by ","

STORED AS TEXTFILE

LOCATION '/user/cloudera/subjects-data/'

TBLPROPERTIES("skip.header.line.count"="1");

hive> select * from ohm.subject;

100 Maths 10

101 Science 20

103 Maths 5

104 Science 5

106 Maths 25

107 Science 25

109 Maths 11

110 Science 21

Time taken: 0.096 seconds, Fetched: 8 row(s)

'//Aggregation operation in Hive

hive> select subject_name,sum(students_attended) from ohm.subject group by subject_name;

Maths 51

Science 71

hive> select subject_name,sum(students_attended), count(subject_name) from ohm.subject group by subject_name;

Maths 51 4

Science 71 4

Sankara's Big Data Notes

Tuesday, 4 August 2020

External Table creation in Hive and loading 4 different data files located in HDFS

No comments:

Post a Comment

Flume - Simple Demo