$ pwd
/home/cloudera/subjects
cat > list1.txt
id,subject_name,students_attended
100,Maths,10
101,Science,20
cat > list2.txt
id,subject_name,students_attended
103,Maths,5
104,Science,5
cat > list3.txt
id,subject_name,students_attended
106,Maths,25
107,Science,25
cat > list4.txt
id,subject_name,students_attended
109,Maths,11
110,Science,21
$ ls -lrt
total 16
-rw-rw-r-- 1 cloudera cloudera 79 Aug 4 05:24 list1.txt
-rw-rw-r-- 1 cloudera cloudera 77 Aug 4 05:24 list2.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug 4 05:25 list3.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug 4 05:25 list4.txt
$ cat *
id,subject_name,students_attended
100,Maths,10
101,Science,20
id,subject_name,students_attended
103,Maths,5
104,Science,5
id,subject_name,students_attended
106,Maths,25
107,Science,25
id,subject_name,students_attended
109,Maths,11
110,Science,21
$ hdfs dfs -mkdir /user/cloudera/subjects-data
$ hdfs dfs -copyFromLocal *.* /user/cloudera/subjects-data/
$ hdfs dfs -ls /user/cloudera/subjects-data/
Found 4 items
-rw-r--r-- 1 cloudera supergroup 79 2020-08-04 05:41 /user/cloudera/subjects-data/list1.txt
-rw-r--r-- 1 cloudera supergroup 77 2020-08-04 05:41 /user/cloudera/subjects-data/list2.txt
-rw-r--r-- 1 cloudera supergroup 79 2020-08-04 05:41 /user/cloudera/subjects-data/list3.txt
-rw-r--r-- 1 cloudera supergroup 79 2020-08-04 05:41 /user/cloudera/subjects-data/list4.txt
create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ",";
LOCATION
hive> LOAD DATA LOCAL INPATH '/home/cloudera/subjects/' INTO TABLE ohm.subject;
Loading data to table ohm.subject
Table ohm.subject stats: [numFiles=4, totalSize=314]
OK
Time taken: 0.97 second
// Here it shows headers in between because each file has its own header
hive> select * from ohm.subject;
OK
NULL subject_name NULL
100 Maths 10
101 Science 20
NULL subject_name NULL
103 Maths 5
104 Science 5
NULL subject_name NULL
106 Maths 25
107 Science 25
NULL subject_name NULL
109 Maths 11
110 Science 21
Time taken: 0.511 seconds, Fetched: 16 row(s)
// droping the table and going to re-create it
hive> drop table ohm.subject;
OK
Time taken: 0.189 seconds
// skipping the header lines
// LOCATION specified in hdfs path
create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ","
STORED AS TEXTFILE
LOCATION '/user/cloudera/subjects-data/'
TBLPROPERTIES("skip.header.line.count"="1");
hive> select * from ohm.subject;
OK
100 Maths 10
101 Science 20
103 Maths 5
104 Science 5
106 Maths 25
107 Science 25
109 Maths 11
110 Science 21
Time taken: 0.096 seconds, Fetched: 8 row(s)
'//Aggregation operation in Hive
hive> select subject_name,sum(students_attended) from ohm.subject group by subject_name;
OK
Maths 51
Science 71
hive> select subject_name,sum(students_attended), count(subject_name) from ohm.subject group by subject_name;
Maths 51 4
Science 71 4
No comments:
Post a Comment