Friday, 7 August 2020

Partitioned table creation in Hive

// How to create a partitioned table in Hive?


// This one is the source table which is non partitioned 

CREATE EXTERNAL TABLE orders

(

orderid INT,

date STRING,

custid INT,

status STRING

)

row format delimited

fields terminated by ","

location '/user/cloudera/orders'



hive> describe orders;

OK

order_id            int                                      

order_date          bigint                                  

order_customer_id    int                                      

order_status        string                                  

Time taken: 0.051 seconds, Fetched: 4 row(s)


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders



// find the unique statuses 

hive> select distinct order_status from orders;


CANCELED

CLOSED

COMPLETE

ON_HOLD

PAYMENT_REVIEW

PENDING

PENDING_PAYMENT

PROCESSING

SUSPECTED_FRAUD



// set the properties

set hive.exec.dynamic.partition=true;

set hive.exec.dynamic.partition.mode=nonstrict;


// Create partition table based on status - so do not include status column in the table structure

create external table orders_partitioned

(

orderid int,

date string,

custid int

)

partitioned by (status string)

row format delimited

fields terminated by ",";




# Partition Information  

# col_name            data_type            comment             

 

status              string                                  

 

Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned  


Table Parameters:  

EXTERNAL            TRUE                

numPartitions        0        ---> No data available so numPartitions is zero as of now.            

transient_lastDdlTime 1596822311  

//Populate partitions with data taken from another table


insert overwrite table orders_partitioned

partition (status)

select order_id, order_date,order_customer_id, order_status from orders;



Loading partition {status=PENDING}

Loading partition {status=COMPLETE}

Loading partition {status=CANCELED}

Loading partition {status=CLOSED}

Loading partition {status=PROCESSING}

Loading partition {status=PAYMENT_REVIEW}

Loading partition {status=SUSPECTED_FRAUD}

Loading partition {status=ON_HOLD}

Loading partition {status=PENDING_PAYMENT}

Table Parameters:  

EXTERNAL            TRUE                

numPartitions        9                   -- We have populated the data taken from another table. so numPartitions is 9

transient_lastDdlTime 1596822311  


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned  

//Find the files in HDFS after populating the data :

hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned

Found 9 items

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CANCELED

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CLOSED

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=ON_HOLD

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PAYMENT_REVIEW

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING_PAYMENT

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PROCESSING

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=SUSPECTED_FRAUD



hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

Found 1 items

-rwxrwxrwx   1 cloudera supergroup     571301 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0


$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | head 

3,1374735600000,12111

5,1374735600000,11318

6,1374735600000,7130

7,1374735600000,4530

15,1374735600000,2568

17,1374735600000,2667

22,1374735600000,333

26,1374735600000,7562

28,1374735600000,656

32,1374735600000,3960



// Performance Analysis 


// Find the line count

$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | wc -l

22899


// Find the row count against source table which is not a partitioned one

hive> select count(1) from orders where order_status ='COMPLETE';

OK

22899

MapReduce Jobs Launched: 

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 5.95 sec   HDFS Read: 45693 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 5 seconds 950 msec   ---> 6 SECONDS



// Find the row count of a partitioned table 

hive> select count(1) from orders_partitioned where status ='COMPLETE';


MapReduce Jobs Launched: 

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 3.38 sec   HDFS Read: 579428 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 3 seconds 380 msec  ----> 3.4 SECONDS 



hive> show partitions ohm.orders_partitioned;

OK 

status=CANCELED

status=CLOSED

status=COMPLETE

status=ON_HOLD

status=PAYMENT_REVIEW

status=PENDING

status=PENDING_PAYMENT

status=PROCESSING

status=SUSPECTED_FRAUD

Time taken: 0.113 seconds, Fetched: 9 row(s)





No comments:

Post a Comment

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...