Saturday, 1 August 2020

Convert Single column into Multiple Column in Hive

//  we have single column : FullName with multiple commas (,) inside 
// FirstName, MiddleName, LastName
input file : col1tocol3.txt:
------------------------------
id|fullname
100|Sankara,Narayanan,Ramaiah
101|Anbu,Sudha,Dhanukkodi
102|Naga,Rathinam,Dhanukkodi
103|Bala,Subramanian,Ramaiah
105|Seetha,Raman,Balasubramanian
106|Subramanian,Nagappan,Pallathur
107|Swaroop,Venkatesh,Bangalore



hdfs dfs -cat hdfs://localhost:8020/sankara/fullnameexa/fullnamefile.txt
id|fullname
100|Sankara,Narayanan,Ramaiah
101|Anbu,Sudha,Dhanukkodi
102|Naga,Rathinam,Dhanukkodi
103|Bala,Subramanian,Ramaiah
105|Seetha,Raman,Balasubramanian
106|Subramanian,Nagappan,Pallathur
107|Swaroop,Venkatesh,Bangalore



CREATE EXTERNAL TABLE IF NOT EXISTS ohm.tmptbl_fullname
(
id int,
fullname string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE
LOCATION 'hdfs://localhost:8020/sankara/fullnameexa/'
TBLPROPERTIES("skip.header.line.count"="1");



hive> select * from ohm.tmptbl_fullname;
OK
100 Sankara,Narayanan,Ramaiah
101 Anbu,Sudha,Dhanukkodi
102 Naga,Rathinam,Dhanukkodi
103 Bala,Subramanian,Ramaiah
105 Seetha,Raman,Balasubramanian
106 Subramanian,Nagappan,Pallathur
107 Swaroop,Venkatesh,Bangalore
Time taken: 0.689 seconds, Fetched: 7 row(s)


//select query with split logic
hive> select split(fullname,',') as fmlname from ohm.tmptbl_fullname;
OK
["Sankara","Narayanan","Ramaiah"]
["Anbu","Sudha","Dhanukkodi"]
["Naga","Rathinam","Dhanukkodi"]
["Bala","Subramanian","Ramaiah"]
["Seetha","Raman","Balasubramanian"]
["Subramanian","Nagappan","Pallathur"]
["Swaroop","Venkatesh","Bangalore"]


hive> create table ohm.tblfirstmiddlelastnames (id int, first string, middle string, last string);
OK
Time taken: 0.252 seconds
hive> show tables;
OK
employee
sample_1
tbl_avro
tbl_header_removed
tbl_parquet
tblfirstmiddlelastnames
tmptbl_fullname
Time taken: 0.022 seconds, Fetched: 7 row(s)



hive> select tblfull.id as id, tblfull.fmlname[0] as first, tblfull.fmlname[1] as middle, tblfull.fmlname[2] as last from (select id,split(fullname,',') as fmlname from ohm.tmptbl_fullname) as tblfull;
OK
100 Sankara Narayanan Ramaiah
101 Anbu Sudha Dhanukkodi
102 Naga Rathinam Dhanukkodi
103 Bala Subramanian Ramaiah
105 Seetha Raman Balasubramanian
106 Subramanian Nagappan Pallathur
107 Swaroop Venkatesh Bangalore

set mapred.job.queue.name=long_running;


insert into ohm.tblfirstmiddlelastnames  select tblfull.id as id, tblfull.fmlname[0] as first, tblfull.fmlname[1] as middle, tblfull.fmlname[2] as last from (select id,split(fullname,',') as fmlname from ohm.tmptbl_fullname) as tblfull;

hive> select * from  ohm.tblfirstmiddlelastnames;
OK
100 Sankara Narayanan Ramaiah
101 Anbu Sudha Dhanukkodi
102 Naga Rathinam Dhanukkodi
103 Bala Subramanian Ramaiah
105 Seetha Raman Balasubramanian
106 Subramanian Nagappan Pallathur
107 Swaroop Venkatesh Bangalore
Time taken: 0.085 seconds, Fetched: 7 row(s)

No comments:

Post a Comment

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...