Sankara's Big Data Notes: Partitioned table creation in Hive

// How to create a partitioned table in Hive?

// This one is the source table which is non partitioned

CREATE EXTERNAL TABLE orders

(

orderid INT,

date STRING,

custid INT,

status STRING

)

row format delimited

fields terminated by ","

location '/user/cloudera/orders'

hive> describe orders;

order_id int

order_date bigint

order_customer_id int

order_status string

Time taken: 0.051 seconds, Fetched: 4 row(s)

Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders

// find the unique statuses

hive> select distinct order_status from orders;

CANCELED

CLOSED

COMPLETE

ON_HOLD

PAYMENT_REVIEW

PENDING

PENDING_PAYMENT

PROCESSING

SUSPECTED_FRAUD

// set the properties

set hive.exec.dynamic.partition=true;

set hive.exec.dynamic.partition.mode=nonstrict;

// Create partition table based on status - so do not include status column in the table structure

create external table orders_partitioned

(

orderid int,

date string,

custid int

)

partitioned by (status string)

row format delimited

fields terminated by ",";

# Partition Information

# col_name data_type comment

status string

Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned

Table Parameters:

EXTERNAL TRUE

numPartitions 0 ---> No data available so numPartitions is zero as of now.

transient_lastDdlTime 1596822311

//Populate partitions with data taken from another table

insert overwrite table orders_partitioned

partition (status)

select order_id, order_date,order_customer_id, order_status from orders;

Loading partition {status=PENDING}

Loading partition {status=COMPLETE}

Loading partition {status=CANCELED}

Loading partition {status=CLOSED}

Loading partition {status=PROCESSING}

Loading partition {status=PAYMENT_REVIEW}

Loading partition {status=SUSPECTED_FRAUD}

Loading partition {status=ON_HOLD}

Loading partition {status=PENDING_PAYMENT}

Table Parameters:

EXTERNAL TRUE

numPartitions 9 -- We have populated the data taken from another table. so numPartitions is 9

transient_lastDdlTime 1596822311

Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned

//Find the files in HDFS after populating the data :

hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned

Found 9 items

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CANCELED

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CLOSED

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=ON_HOLD

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PAYMENT_REVIEW

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING_PAYMENT

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PROCESSING

drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=SUSPECTED_FRAUD

hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

Found 1 items

-rwxrwxrwx 1 cloudera supergroup 571301 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0

$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | head

3,1374735600000,12111

5,1374735600000,11318

6,1374735600000,7130

7,1374735600000,4530

15,1374735600000,2568

17,1374735600000,2667

22,1374735600000,333

26,1374735600000,7562

28,1374735600000,656

32,1374735600000,3960

// Performance Analysis

// Find the line count

$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | wc -l

22899

// Find the row count against source table which is not a partitioned one

hive> select count(1) from orders where order_status ='COMPLETE';

22899

MapReduce Jobs Launched:

Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 5.95 sec HDFS Read: 45693 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 5 seconds 950 msec ---> 6 SECONDS

// Find the row count of a partitioned table

hive> select count(1) from orders_partitioned where status ='COMPLETE';

MapReduce Jobs Launched:

Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 3.38 sec HDFS Read: 579428 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 3 seconds 380 msec ----> 3.4 SECONDS

hive> show partitions ohm.orders_partitioned;

status=CANCELED

status=CLOSED

status=COMPLETE

status=ON_HOLD

status=PAYMENT_REVIEW

status=PENDING

status=PENDING_PAYMENT

status=PROCESSING

status=SUSPECTED_FRAUD

Time taken: 0.113 seconds, Fetched: 9 row(s)

Sankara's Big Data Notes

Friday, 7 August 2020

Partitioned table creation in Hive

No comments:

Post a Comment

Flume - Simple Demo