Tuesday, 4 August 2020

External Table creation in Hive and loading 4 different data files located in HDFS

$ pwd
/home/cloudera/subjects

cat > list1.txt
id,subject_name,students_attended
100,Maths,10
101,Science,20

cat > list2.txt
id,subject_name,students_attended
103,Maths,5
104,Science,5

cat > list3.txt
id,subject_name,students_attended
106,Maths,25
107,Science,25

cat > list4.txt
id,subject_name,students_attended
109,Maths,11
110,Science,21

$ ls -lrt
total 16
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:24 list1.txt
-rw-rw-r-- 1 cloudera cloudera 77 Aug  4 05:24 list2.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:25 list3.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:25 list4.txt

$ cat *
id,subject_name,students_attended
100,Maths,10
101,Science,20
id,subject_name,students_attended
103,Maths,5
104,Science,5
id,subject_name,students_attended
106,Maths,25
107,Science,25
id,subject_name,students_attended
109,Maths,11
110,Science,21



$ hdfs dfs -mkdir /user/cloudera/subjects-data
$ hdfs dfs -copyFromLocal *.* /user/cloudera/subjects-data/

$ hdfs dfs -ls /user/cloudera/subjects-data/
Found 4 items
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list1.txt
-rw-r--r--   1 cloudera supergroup         77 2020-08-04 05:41 /user/cloudera/subjects-data/list2.txt
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list3.txt
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list4.txt




create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ",";
LOCATION 



hive> LOAD DATA LOCAL INPATH '/home/cloudera/subjects/' INTO TABLE ohm.subject;
Loading data to table ohm.subject
Table ohm.subject stats: [numFiles=4, totalSize=314]
OK
Time taken: 0.97 second

// Here it shows headers in between because each file has its own header
hive> select * from ohm.subject;
OK
NULL subject_name NULL
100 Maths 10
101 Science 20
NULL subject_name NULL
103 Maths 5
104 Science 5
NULL subject_name NULL
106 Maths 25
107 Science 25
NULL subject_name NULL
109 Maths 11
110 Science 21
Time taken: 0.511 seconds, Fetched: 16 row(s)

// droping the table and going to re-create it
hive> drop table ohm.subject;
OK
Time taken: 0.189 seconds

// skipping the header lines
// LOCATION specified in hdfs path
create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ","
STORED AS TEXTFILE
LOCATION '/user/cloudera/subjects-data/'
TBLPROPERTIES("skip.header.line.count"="1");


hive> select * from ohm.subject;
OK
100 Maths 10
101 Science 20
103 Maths 5
104 Science 5
106 Maths 25
107 Science 25
109 Maths 11
110 Science 21
Time taken: 0.096 seconds, Fetched: 8 row(s)


'//Aggregation operation in Hive


hive> select subject_name,sum(students_attended) from ohm.subject group by subject_name;

OK
Maths 51
Science 71



hive> select subject_name,sum(students_attended), count(subject_name) from ohm.subject group by subject_name;

Maths 51 4
Science 71 4

No comments:

Post a Comment

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...