// How to create a partitioned table in Hive?
// This one is the source table which is non partitioned
CREATE EXTERNAL TABLE orders
(
orderid INT,
date STRING,
custid INT,
status STRING
)
row format delimited
fields terminated by ","
location '/user/cloudera/orders'
hive> describe orders;
OK
order_id int
order_date bigint
order_customer_id int
order_status string
Time taken: 0.051 seconds, Fetched: 4 row(s)
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders
// find the unique statuses
hive> select distinct order_status from orders;
CANCELED
CLOSED
COMPLETE
ON_HOLD
PAYMENT_REVIEW
PENDING
PENDING_PAYMENT
PROCESSING
SUSPECTED_FRAUD
// set the properties
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
// Create partition table based on status - so do not include status column in the table structure
create external table orders_partitioned
(
orderid int,
date string,
custid int
)
partitioned by (status string)
row format delimited
fields terminated by ",";
# Partition Information
# col_name data_type comment
status string
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned
Table Parameters:
EXTERNAL TRUE
numPartitions 0 ---> No data available so numPartitions is zero as of now.
transient_lastDdlTime 1596822311
//Populate partitions with data taken from another table
insert overwrite table orders_partitioned
partition (status)
select order_id, order_date,order_customer_id, order_status from orders;
Loading partition {status=PENDING}
Loading partition {status=COMPLETE}
Loading partition {status=CANCELED}
Loading partition {status=CLOSED}
Loading partition {status=PROCESSING}
Loading partition {status=PAYMENT_REVIEW}
Loading partition {status=SUSPECTED_FRAUD}
Loading partition {status=ON_HOLD}
Loading partition {status=PENDING_PAYMENT}
Table Parameters:
EXTERNAL TRUE
numPartitions 9 -- We have populated the data taken from another table. so numPartitions is 9
transient_lastDdlTime 1596822311
Location: hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned
//Find the files in HDFS after populating the data :
hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned
Found 9 items
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CANCELED
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CLOSED
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=ON_HOLD
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PAYMENT_REVIEW
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING_PAYMENT
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PROCESSING
drwxrwxrwx - cloudera supergroup 0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=SUSPECTED_FRAUD
hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE
Found 1 items
-rwxrwxrwx 1 cloudera supergroup 571301 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0
$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | head
3,1374735600000,12111
5,1374735600000,11318
6,1374735600000,7130
7,1374735600000,4530
15,1374735600000,2568
17,1374735600000,2667
22,1374735600000,333
26,1374735600000,7562
28,1374735600000,656
32,1374735600000,3960
// Performance Analysis
// Find the line count
$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | wc -l
22899
// Find the row count against source table which is not a partitioned one
hive> select count(1) from orders where order_status ='COMPLETE';
OK
22899
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 5.95 sec HDFS Read: 45693 HDFS Write: 6 SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 950 msec ---> 6 SECONDS
// Find the row count of a partitioned table
hive> select count(1) from orders_partitioned where status ='COMPLETE';
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 3.38 sec HDFS Read: 579428 HDFS Write: 6 SUCCESS
Total MapReduce CPU Time Spent: 3 seconds 380 msec ----> 3.4 SECONDS
hive> show partitions ohm.orders_partitioned;
OK
status=CANCELED
status=CLOSED
status=COMPLETE
status=ON_HOLD
status=PAYMENT_REVIEW
status=PENDING
status=PENDING_PAYMENT
status=PROCESSING
status=SUSPECTED_FRAUD
Time taken: 0.113 seconds, Fetched: 9 row(s)
No comments:
Post a Comment