
Tuesday, 11 August 2020

Read, Write CSV files using Spark

scala> val df = spark.sql("select * from ohm.orders")
df: org.apache.spark.sql.DataFrame = [order_id: int, order_date: bigint ... 2 more fields]

scala> df.printSchema()
root
 |-- order_id: integer (nullable = true)
 |-- order_date: long (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)
 

scala> val dfOrders = df.withColumnRenamed("order_id","id").withColumnRenamed("order_date","dt").withColumnRenamed("order_customer_id","cust_id").withColumnRenamed("order_status","status")
dfOrders: org.apache.spark.sql.DataFrame = [id: int, dt: bigint ... 2 more fields]

scala> dfOrders.printSchema()
root
 |-- id: integer (nullable = true)
 |-- dt: long (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- status: string (nullable = true)

scala> df.show(5)
+--------+-------------+-----------------+---------------+
|order_id|   order_date|order_customer_id|   order_status|
+--------+-------------+-----------------+---------------+
|       1|1374735600000|            11599|         CLOSED|
|       2|1374735600000|              256|PENDING_PAYMENT|
|       3|1374735600000|            12111|       COMPLETE|
|       4|1374735600000|             8827|         CLOSED|
|       5|1374735600000|            11318|       COMPLETE|
+--------+-------------+-----------------+---------------+

// Write the DataFrame out as CSV
scala> df.write.format("csv").save("/home/cloudera/sparkoutput/orders_csv")

// display the csv file content in terminal - top 10 lines
$ cat /home/cloudera/sparkoutput/orders_csv/part-00000-1f2f2adb-849e-49c0-bdef-8706cf1bcfb9-c000.csv | head
1,1374735600000,11599,CLOSED
2,1374735600000,256,PENDING_PAYMENT
3,1374735600000,12111,COMPLETE
4,1374735600000,8827,CLOSED
5,1374735600000,11318,COMPLETE
6,1374735600000,7130,COMPLETE
7,1374735600000,4530,COMPLETE
8,1374735600000,2911,PROCESSING
9,1374735600000,5657,PENDING_PAYMENT
10,1374735600000,5648,PENDING_PAYMENT
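
The save above used the CSV defaults (comma delimiter, no header line). Below is a minimal sketch, not run in this session, of the same write with a header row, a pipe delimiter and overwrite mode; the output path is just an example.

// illustrative only - options and path are not from the original session
df.write
  .format("csv")
  .option("header", "true")      // write the column names as the first line
  .option("delimiter", "|")      // use pipe instead of the default comma
  .mode("overwrite")             // replace the directory if it already exists
  .save("/home/cloudera/sparkoutput/orders_csv_with_header")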




// Read the CSV file using Spark -- the schema is not inferred here, so every field comes back as a string
scala> val dfOrdersCSV = spark.read.format("csv").load("/home/cloudera/sparkoutput/orders_csv/")
dfOrdersCSV: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 2 more fields]

scala> dfOrdersCSV.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)


// With inferSchema enabled, the corresponding data types are applied - but we still have no column names (headers)
scala> val dfOrdersCSV = spark.read.format("csv").option("inferSchema","True").load("/home/cloudera/sparkoutput/orders_csv/")
dfOrdersCSV: org.apache.spark.sql.DataFrame = [_c0: int, _c1: bigint ... 2 more fields]

scala> dfOrdersCSV.printSchema()
root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)


// A header option was added, yet the columns stay _c0.._c3: the option name is actually "header" (not "headers"), so it is ignored - and this CSV has no header row anyway
scala> val dfOrdersCSV = spark.read.format("csv").option("inferSchema","True").option("headers","true").load("/home/cloudera/sparkoutput/orders_csv/")
dfOrdersCSV: org.apache.spark.sql.DataFrame = [_c0: int, _c1: bigint ... 2 more fields]

scala> dfOrdersCSV.printSchema()
root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 
 
 scala> val df = dfOrdersCSV.withColumnRenamed("_c0","order_id").withColumnRenamed("_c1","dt").withColumnRenamed("_c2","cust_id").withColumnRenamed("_c3","status")
df: org.apache.spark.sql.DataFrame = [order_id: int, dt: bigint ... 2 more fields]

scala> df.printSchema()
root
 |-- order_id: integer (nullable = true)
 |-- dt: long (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- status: string (nullable = true)


scala> df.show(5)
+--------+-------------+-------+---------------+
|order_id|           dt|cust_id|         status|
+--------+-------------+-------+---------------+
|       1|1374735600000|  11599|         CLOSED|
|       2|1374735600000|    256|PENDING_PAYMENT|
|       3|1374735600000|  12111|       COMPLETE|
|       4|1374735600000|   8827|         CLOSED|
|       5|1374735600000|  11318|       COMPLETE|
+--------+-------------+-------+---------------+
only showing top 5 rows
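
Instead of inferSchema followed by a chain of withColumnRenamed calls, the column names and types can be supplied up front when reading. A sketch, not part of the original session:

import org.apache.spark.sql.types._

// column names and types declared once - no renaming pass needed afterwards
val ordersSchema = StructType(Seq(
  StructField("order_id", IntegerType),
  StructField("dt",       LongType),
  StructField("cust_id",  IntegerType),
  StructField("status",   StringType)
))

val dfOrdersTyped = spark.read.format("csv")
  .schema(ordersSchema)
  .load("/home/cloudera/sparkoutput/orders_csv/")

dfOrdersTyped.printSchema()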

 
 



Read Hive table in Spark using Spark SQL | Read, Write JSON


// create a DataFrame from the Hive table ohm.customers
scala> val dfCust = spark.sql("select * from ohm.customers")
dfCust: org.apache.spark.sql.DataFrame = [customer_id: int, customer_fname: string ... 7 more fields]

scala> dfCust.printSchema()
root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zipcode: string (nullable = true)

// Rename the columns
scala> val df = dfCust.withColumnRenamed("customer_id","id").withColumnRenamed("customer_fname","fname").withColumnRenamed("customer_lname","lname").withColumnRenamed("customer_email","email").withColumnRenamed("customer_password","password").withColumnRenamed("customer_street","street").withColumnRenamed("customer_city","city").withColumnRenamed("customer_state","state").withColumnRenamed("customer_zipcode","zipcode")
df: org.apache.spark.sql.DataFrame = [id: int, fname: string ... 7 more fields]

scala> df.printSchema()
root
 |-- id: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- email: string (nullable = true)
 |-- password: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zipcode: string (nullable = true)

scala> df.show(10)
+---+-------+---------+---------+---------+--------------------+-----------+-----+-------+
| id|  fname|    lname|    email| password|              street|       city|state|zipcode|
+---+-------+---------+---------+---------+--------------------+-----------+-----+-------+
|  1|Richard|Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville|   TX|  78521|
|  2|   Mary|  Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton|   CO|  80126|
|  3|    Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|     Caguas|   PR|  00725|
|  4|   Mary|    Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common| San Marcos|   CA|  92069|
|  5| Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|     Caguas|   PR|  00725|
|  6|   Mary|    Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|    Passaic|   NJ|  07055|
|  7|Melissa|   Wilcox|XXXXXXXXX|XXXXXXXXX|9453 High Concession|     Caguas|   PR|  00725|
|  8|  Megan|    Smith|XXXXXXXXX|XXXXXXXXX|3047 Foggy Forest...|   Lawrence|   MA|  01841|
|  9|   Mary|    Perez|XXXXXXXXX|XXXXXXXXX| 3616 Quaking Street|     Caguas|   PR|  00725|
| 10|Melissa|    Smith|XXXXXXXXX|XXXXXXXXX|8598 Harvest Beac...|   Stafford|   VA|  22554|
+---+-------+---------+---------+---------+--------------------+-----------+-----+-------+
only showing top 10 rows

// write the DataFrame out as a JSON file
scala> df.write.format("json").save("/home/cloudera/sparkoutput/cust_json")

// display 10 lines
$ head /home/cloudera/sparkoutput/cust_json/part-00000-d87c7ca5-261d-4b90-9044-3ed4513f29c8-c000.json 

{"id":1,"fname":"Richard","lname":"Hernandez","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"6303 Heather Plaza","city":"Brownsville","state":"TX","zipcode":"78521"}
{"id":2,"fname":"Mary","lname":"Barrett","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"9526 Noble Embers Ridge","city":"Littleton","state":"CO","zipcode":"80126"}
{"id":3,"fname":"Ann","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3422 Blue Pioneer Bend","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":4,"fname":"Mary","lname":"Jones","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"8324 Little Common","city":"San Marcos","state":"CA","zipcode":"92069"}
{"id":5,"fname":"Robert","lname":"Hudson","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"10 Crystal River Mall ","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":6,"fname":"Mary","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3151 Sleepy Quail Promenade","city":"Passaic","state":"NJ","zipcode":"07055"}
{"id":7,"fname":"Melissa","lname":"Wilcox","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"9453 High Concession","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":8,"fname":"Megan","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3047 Foggy Forest Plaza","city":"Lawrence","state":"MA","zipcode":"01841"}
{"id":9,"fname":"Mary","lname":"Perez","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3616 Quaking Street","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":10,"fname":"Melissa","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"8598 Harvest Beacon Plaza","city":"Stafford","state":"VA","zipcode":"22554"}


// read the JSON files back into a DataFrame
scala> val df = spark.read.format("json").load("/home/cloudera/sparkoutput/cust_json/")
df: org.apache.spark.sql.DataFrame = [city: string, email: string ... 7 more fields]

scala> df.printSchema()
root
 |-- city: string (nullable = true)
 |-- email: string (nullable = true)
 |-- fname: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lname: string (nullable = true)
 |-- password: string (nullable = true)
 |-- state: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zipcode: string (nullable = true)


scala> df.show(5)
+-----------+---------+-------+---+---------+---------+-----+--------------------+-------+
|       city|    email|  fname| id|    lname| password|state|              street|zipcode|
+-----------+---------+-------+---+---------+---------+-----+--------------------+-------+
|Brownsville|XXXXXXXXX|Richard|  1|Hernandez|XXXXXXXXX|   TX|  6303 Heather Plaza|  78521|
|  Littleton|XXXXXXXXX|   Mary|  2|  Barrett|XXXXXXXXX|   CO|9526 Noble Embers...|  80126|
|     Caguas|XXXXXXXXX|    Ann|  3|    Smith|XXXXXXXXX|   PR|3422 Blue Pioneer...|  00725|
| San Marcos|XXXXXXXXX|   Mary|  4|    Jones|XXXXXXXXX|   CA|  8324 Little Common|  92069|
|     Caguas|XXXXXXXXX| Robert|  5|   Hudson|XXXXXXXXX|   PR|10 Crystal River ...|  00725|
+-----------+---------+-------+---+---------+---------+-----+--------------------+-------+
only showing top 5 rows



// select specific columns only
scala> df.select("id","fname","lname","city").show(5)
+---+-------+---------+-----------+
| id|  fname|    lname|       city|
+---+-------+---------+-----------+
|  1|Richard|Hernandez|Brownsville|
|  2|   Mary|  Barrett|  Littleton|
|  3|    Ann|    Smith|     Caguas|
|  4|   Mary|    Jones| San Marcos|
|  5| Robert|   Hudson|     Caguas|
+---+-------+---------+-----------+
only showing top 5 rows

// write the selected columns out as JSON
scala> df.select("id","fname","lname","city").write.format("json").save("/home/cloudera/sparkoutput/custjson_4cols")


// display top 10 lines 
$ head /home/cloudera/sparkoutput/custjson_4cols/part-00000-9191ead8-927e-4c45-8807-6e6ad40abb33-c000.json 
{"id":1,"fname":"Richard","lname":"Hernandez","city":"Brownsville"}
{"id":2,"fname":"Mary","lname":"Barrett","city":"Littleton"}
{"id":3,"fname":"Ann","lname":"Smith","city":"Caguas"}
{"id":4,"fname":"Mary","lname":"Jones","city":"San Marcos"}
{"id":5,"fname":"Robert","lname":"Hudson","city":"Caguas"}
{"id":6,"fname":"Mary","lname":"Smith","city":"Passaic"}
{"id":7,"fname":"Melissa","lname":"Wilcox","city":"Caguas"}
{"id":8,"fname":"Megan","lname":"Smith","city":"Lawrence"}
{"id":9,"fname":"Mary","lname":"Perez","city":"Caguas"}
{"id":10,"fname":"Melissa","lname":"Smith","city":"Stafford"}


Saturday, 8 August 2020

Bucketing a table in Hive

// Here we are going to create a bucketed table in Hive and populate it from the orders table

CREATE EXTERNAL TABLE ohm.orders_bucketed
(
orderid int,
date string,
custid int,
status string
)
clustered by (custid) into 10 buckets
row format delimited 
fields terminated by ',';


Location:  hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed  



// When partitioning, the partition column is NOT added to the table's column list.
// When bucketing, the bucket column IS included in the table's column list.

// enable the bucketing property
set hive.enforce.bucketing = true;

// populate it with data taken from the orders table
hive> insert overwrite table ohm.orders_bucketed select order_id, order_date, order_customer_id, order_status from ohm.orders;

number of mappers: 1; number of reducers: 10
Loading data to table ohm.orders_bucketed
Table ohm.orders_bucketed stats: [numFiles=10, numRows=68883, totalSize=2448880, rawDataSize=2379997]


hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed
Found 10 items
-rwxrwxrwx   1 cloudera supergroup     246834 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000000_0
-rwxrwxrwx   1 cloudera supergroup     242162 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000001_0
-rwxrwxrwx   1 cloudera supergroup     246462 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000002_0
-rwxrwxrwx   1 cloudera supergroup     244385 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000003_0
-rwxrwxrwx   1 cloudera supergroup     243974 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000004_0
-rwxrwxrwx   1 cloudera supergroup     242629 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000005_0
-rwxrwxrwx   1 cloudera supergroup     241932 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000006_0
-rwxrwxrwx   1 cloudera supergroup     247100 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000007_0
-rwxrwxrwx   1 cloudera supergroup     248135 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000008_0
-rwxrwxrwx   1 cloudera supergroup     245267 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000009_0
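
Each row lands in a bucket based on the hash of the clustering column modulo the bucket count; with Hive's default hash for integer columns this is simply the value itself, so file 000003_0 should hold only rows with custid % 10 == 3. A spark-shell sketch (not from the original post) to spot-check one bucket file:

// read one bucket file as CSV; _c2 is the customer id column in this layout
val bucket3 = spark.read.format("csv")
  .load("hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000003_0")

// if bucketing worked, this should show the single value 3
bucket3.selectExpr("cast(_c2 as int) % 10 as bucket_no").distinct().show()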

Friday, 7 August 2020

Partitioned table creation in Hive

// How to create a partitioned table in Hive?


// This is the source table, which is not partitioned

CREATE EXTERNAL TABLE orders
(
orderid INT,
date STRING,
custid INT,
status STRING
)
row format delimited
fields terminated by ","
location '/user/cloudera/orders';



hive> describe orders;

OK

order_id            int                                      

order_date          bigint                                  

order_customer_id    int                                      

order_status        string                                  

Time taken: 0.051 seconds, Fetched: 4 row(s)


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders



// find the unique statuses 

hive> select distinct order_status from orders;


CANCELED

CLOSED

COMPLETE

ON_HOLD

PAYMENT_REVIEW

PENDING

PENDING_PAYMENT

PROCESSING

SUSPECTED_FRAUD



// set the properties

set hive.exec.dynamic.partition=true;

set hive.exec.dynamic.partition.mode=nonstrict;


// Create the table partitioned by status - the partition column is not included in the column list

create external table orders_partitioned
(
orderid int,
date string,
custid int
)
partitioned by (status string)
row format delimited
fields terminated by ",";




# Partition Information  

# col_name            data_type            comment             

 

status              string                                  

 

Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned  


Table Parameters:  

EXTERNAL            TRUE                

numPartitions        0        ---> No data available so numPartitions is zero as of now.            

transient_lastDdlTime 1596822311  

// Populate partitions with data taken from another table

insert overwrite table orders_partitioned
partition (status)
select order_id, order_date, order_customer_id, order_status from orders;



Loading partition {status=PENDING}

Loading partition {status=COMPLETE}

Loading partition {status=CANCELED}

Loading partition {status=CLOSED}

Loading partition {status=PROCESSING}

Loading partition {status=PAYMENT_REVIEW}

Loading partition {status=SUSPECTED_FRAUD}

Loading partition {status=ON_HOLD}

Loading partition {status=PENDING_PAYMENT}

Table Parameters:  

EXTERNAL            TRUE                

numPartitions        9                   -- We have populated the data taken from another table. so numPartitions is 9

transient_lastDdlTime 1596822311  


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned  

//Find the files in HDFS after populating the data :

hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned

Found 9 items

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CANCELED

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CLOSED

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=ON_HOLD

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PAYMENT_REVIEW

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING_PAYMENT

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PROCESSING

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=SUSPECTED_FRAUD



hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

Found 1 items

-rwxrwxrwx   1 cloudera supergroup     571301 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0


$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | head 

3,1374735600000,12111

5,1374735600000,11318

6,1374735600000,7130

7,1374735600000,4530

15,1374735600000,2568

17,1374735600000,2667

22,1374735600000,333

26,1374735600000,7562

28,1374735600000,656

32,1374735600000,3960



// Performance Analysis 


// Find the line count

$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | wc -l

22899


// Find the row count from the source table, which is not partitioned

hive> select count(1) from orders where order_status ='COMPLETE';

OK

22899

MapReduce Jobs Launched: 

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 5.95 sec   HDFS Read: 45693 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 5 seconds 950 msec   ---> 6 SECONDS



// Find the row count of a partitioned table 

hive> select count(1) from orders_partitioned where status ='COMPLETE';


MapReduce Jobs Launched: 

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 3.38 sec   HDFS Read: 579428 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 3 seconds 380 msec  ----> 3.4 SECONDS 



hive> show partitions ohm.orders_partitioned;

OK 

status=CANCELED

status=CLOSED

status=COMPLETE

status=ON_HOLD

status=PAYMENT_REVIEW

status=PENDING

status=PENDING_PAYMENT

status=PROCESSING

status=SUSPECTED_FRAUD

Time taken: 0.113 seconds, Fetched: 9 row(s)
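
Because the table is laid out as one directory per status, a query that filters on the partition column only has to read that one directory (partition pruning). A quick spark-shell check, not part of the original hive session:

// the partition column behaves like a normal column when querying
val complete = spark.sql("select * from ohm.orders_partitioned where status = 'COMPLETE'")
complete.count()     // expected to match the 22899 rows counted above
complete.explain()   // the physical plan should show the partition filter on status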





String and Date Functions in Hive

// Here we are going to play with String and Date Functions in Hive


 hive> create table cust as select customer_fname  as firstname, customer_lname as lastname, customer_city  as city from ohm.customers;


hive> select * from cust limit 10;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton

Ann Smith Caguas

Mary Jones San Marcos

Robert Hudson Caguas

Mary Smith Passaic

Melissa Wilcox Caguas

Megan Smith Lawrence

Mary Perez Caguas

Melissa Smith Stafford


hive> describe cust;

OK

firstname            string                                  

lastname            string                                  

city                string                                  

Time taken: 0.041 seconds, Fetched: 3 row(s)

                                




hive> alter table cust change customer_city city string;

OK

Time taken: 0.308 seconds



// Concat function

hive> select concat(firstname , "-",   lastname, "-" , city ) from cust limit 2;

OK

Richard-Hernandez-Brownsville

Mary-Barrett-Littleton

 

hive> select concat(firstname , "|",   lastname, "|" , city ) from cust limit 2;

OK

Richard|Hernandez|Brownsville

Mary|Barrett|Littleton



hive> select concat(firstname , "\t",   lastname, "\t" , city ) from cust limit 2;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton



hive> select concat_ws("\t",firstname,lastname,city ) from cust limit 2;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton

Time taken: 0.138 seconds, Fetched: 2 row(s)

hive> select concat_ws("$",firstname,lastname,city ) from cust limit 2;

OK

Richard$Hernandez$Brownsville

Mary$Barrett$Littleton

Time taken: 0.057 seconds, Fetched: 2 row(s)

hive> select concat_ws("|",firstname,lastname,city ) from cust limit 2;

OK

Richard|Hernandez|Brownsville

Mary|Barrett|Littleton

Time taken: 0.04 seconds, Fetched: 2 row(s)



hive> select concat(firstname , "\t",   lastname, "\t" , city ) from cust limit 10;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton

Ann Smith Caguas

Mary Jones San Marcos

Robert Hudson Caguas

Mary Smith Passaic

Melissa Wilcox Caguas

Megan Smith Lawrence

Mary Perez Caguas

Melissa Smith Stafford


 

hive> select concat(firstname , "|",   lastname, "|" , city ) from cust limit 10;

OK

Richard|Hernandez|Brownsville

Mary|Barrett|Littleton

Ann|Smith|Caguas

Mary|Jones|San Marcos

Robert|Hudson|Caguas

Mary|Smith|Passaic

Melissa|Wilcox|Caguas

Megan|Smith|Lawrence

Mary|Perez|Caguas

Melissa|Smith|Stafford

 


hive> select length(firstname) as Lenn , firstname from cust where length(firstname)  > 8 limit 5;

OK

11 Christopher

9 Katherine

9 Stephanie

9 Katherine

9 Alexander



hive> select length("Raja Ram Mohan Rai"), "Raja Ram Mohan Rai";

OK

18 Raja Ram Mohan Rai


hive> select concat (length("S/o. Mahalakshmi") ,".  ", "S/o. Mahalakshmi");

OK

16.  S/o. Mahalakshmi


hive> select concat_ws(":","Kadavul","Amaithu","Vaitha","Medai");

OK

Kadavul:Amaithu:Vaitha:Medai

 

hive> select concat_ws("|","Kadavul","Amaithu","Vaitha","Medai");

OK

Kadavul|Amaithu|Vaitha|Medai




hive> select concat(firstname , " " , lastname) from cust where length(firstname) > 10 limit 5;

OK

Christopher Smith

Christopher Prince

Christopher Smith

Christopher Curtis

Christopher Johnson

Time taken: 0.044 seconds, Fetched: 5 row(s)



hive> select concat(firstname , " " , lastname) from cust where length(concat(firstname , " " , lastname)) > 20 limit 5;

OK

Christopher Richardson

Christopher Villarreal

Christopher Alexander

Time taken: 0.094 seconds, Fetched: 3 row(s)


// Case conversion

hive> select UPPER(firstname), LOWER(lastname), upper(city)  from cust limit 2;

OK

RICHARD hernandez BROWNSVILLE

MARY barrett LITTLETON

Time taken: 0.343 seconds, Fetched: 2 row(s)


// Padding Left, Right

hive> select lpad(firstname,12,"*"), rpad(lastname,12,"*") from cust limit 2;

OK

*****Richard Hernandez***

********Mary Barrett*****



hive> select upper("I love India"), lower("I love India");

OK

I LOVE INDIA i love india

Time taken: 0.031 seconds, Fetched: 1 row(s)


// Split function

hive> select split("Arun Raja Kama Raj"," "); 

OK

["Arun","Raja","Kama","Raj"]



hive> select substr("Raja Raja Chozhan",11);

OK

Chozhan



hive> select substr(firstname,1,3), " ", firstname from cust limit 5;

OK

Ric Richard

Mar Mary

Ann Ann

Mar Mary

Rob Robert




hive> select regexp_replace("Raja Raja Chozhan","Raja","King");

OK

King King Chozhan



hive> select  firstname ," ", regexp_replace(firstname,"Mary","Laila"), " " from cust limit 5;

OK

Richard Richard  

Mary Laila  

Ann Ann  

Mary Laila  

Robert Robert  



hive> select current_date(), year(current_date()), month(current_date()) , date(current_date()), day(current_date()) ;

OK

2020-08-07 2020 8 2020-08-07 7

Time taken: 0.04 seconds, Fetched: 1 row(s)



hive> select current_timestamp(), hour(current_timestamp()), minute(current_timestamp()), second(current_timestamp());

OK

2020-08-07 08:32:41.383 8 32 41



hive> select datediff(current_date(), "1976-04-20");

OK

16180



hive> select datediff(current_date(), "2020-01-01");

OK

219



hive> select date_add(current_date(),100), date_sub(current_date(),219);

OK

2020-11-15 2020-01-01



hive> select months_between(current_date(),"2020-01-01");

OK

7.19354839



hive> select add_months("2020-01-01",7);

OK

2020-08-01



hive> select current_date(), current_timestamp();

OK

2020-08-07 2020-08-07 08:38:28.5



hive> select current_date(), current_timestamp(), unix_timestamp();

unix_timestamp(void) is deprecated. Use current_timestamp instead.

OK

2020-08-07 2020-08-07 08:38:53.775 1596814733



hive> select unix_timestamp(current_timestamp);

OK

1596814766
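
One related conversion worth noting, shown here as a spark-shell sketch rather than part of the hive session above: the order_date values seen earlier (e.g. 1374735600000) are epoch milliseconds, while from_unixtime() expects seconds, so divide by 1000 before formatting.

spark.sql("""
  select order_id,
         from_unixtime(cast(order_date / 1000 as bigint), 'yyyy-MM-dd') as order_day
  from ohm.orders
""").show(5)
// 1374735600000 ms -> 1374735600 s -> 2013-07-25 (in the VM's local time zone)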


Database, Table commands in Hive


// Create a new database in Hive 

hive> create database retail_db;



// create a new table from data in a table that belongs to another database

hive> create table retail_db.customer_temp as select customer_id as id, customer_fname as fname, customer_lname as lname from ohm.customers;


hive> use retail_db;

OK

Time taken: 0.014 seconds



hive> show tables;

OK

customer_temp

Time taken: 0.01 seconds, Fetched: 1 row(s)





hive> describe customer_temp;

OK

id                  int                                      

fname                string                                  

lname                string                                  

Time taken: 0.05 seconds, Fetched: 3 row(s)

hive> describe formatted customer_temp;

OK

# col_name            data_type            comment             

 

id                  int                                      

fname                string                                  

lname                string                                  

 

# Detailed Table Information  

Database:            retail_db             

Owner:              cloudera             

CreateTime:          Fri Aug 07 05:44:14 PDT 2020  

LastAccessTime:      UNKNOWN               

Protect Mode:        None                 

Retention:          0                     

Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/retail_db.db/customer_temp  

Table Type:          MANAGED_TABLE         

Table Parameters:  

COLUMN_STATS_ACCURATE true                

numFiles            1                   

numRows              12435               

rawDataSize          211716              

totalSize            224151              

transient_lastDdlTime 1596804254          

 

# Storage Information  

SerDe Library:      org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe  

InputFormat:        org.apache.hadoop.mapred.TextInputFormat  

OutputFormat:        org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat  

Compressed:          No                   

Num Buckets:        -1                   

Bucket Columns:      []                   

Sort Columns:        []                   

Storage Desc Params:  

serialization.format 1                   

Time taken: 0.066 seconds, Fetched: 33 row(s)



// Renaming a database is not supported in Hive (though there is a workaround)

ALTER DATABASE retail_db RENAME TO retail_new;  // not possible in CDH 5.13


// Rename the table 

hive> alter table customer_temp rename to cust;

OK

Time taken: 0.114 seconds



hive> show tables;

OK

cust

Time taken: 0.018 seconds, Fetched: 1 row(s)



hive> describe cust;

OK

id                  int                                      

fname                string                                  

lname                string                                  

Time taken: 0.057 seconds, Fetched: 3 row(s)


// How to rename the column?

hive> alter table cust change id custid int;

OK

Time taken: 0.128 seconds



hive> describe cust;

OK

custid              int                                      

fname                string                                  

lname                string                                  

Time taken: 0.051 seconds, Fetched: 3 row(s)


// How to replace the names of a set of columns in one statement?


hive> alter table cust replace columns (id int, firstname string, lastname string);

OK

Time taken: 0.088 seconds




hive> describe cust;

OK

id                  int                                      

firstname            string                                  

lastname            string                                  

Time taken: 0.048 seconds, Fetched: 3 row(s)



// How to add new columns to the existing table?


hive> alter table cust add columns (age int, email string);

OK

Time taken: 0.069 seconds



hive> describe cust;

OK

id                  int                                      

firstname            string                                  

lastname            string                                  

age                  int      -------> New column                                  

email                string       -------> New column                                      

Time taken: 0.047 seconds, Fetched: 5 row(s)


// Newly added columns hold NULL for the existing rows

hive> select * from cust limit 10;



1 Richard Hernandez NULL NULL

2 Mary Barrett NULL NULL

3 Ann Smith NULL NULL

4 Mary Jones NULL NULL

5 Robert Hudson NULL NULL

6 Mary Smith NULL NULL

7 Melissa Wilcox NULL NULL

8 Megan Smith NULL NULL

9 Mary Perez NULL NULL

10 Melissa Smith NULL NULL

Time taken: 0.063 seconds, Fetched: 10 row(s)



// How to change the file format of existing table?

hive> alter table cust set fileformat parquet;

OK

Time taken: 0.089 seconds


SerDe Library:      org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe  

InputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat  

OutputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat  



hive> alter table cust set fileformat avro;

OK

Time taken: 0.108 seconds


hive> describe formatted cust;


SerDe Library:      org.apache.hadoop.hive.serde2.avro.AvroSerDe  

InputFormat:        org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat  

OutputFormat:        org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/retail_db.db/cust




// How to change the location of a table?


hive> alter table cust set location '/user/cloudera/customer';

OK


hive> describe formatted cust;


Location:            hdfs://quickstart.cloudera:8020/user/cloudera/customer
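
The same metadata can be inspected from spark-shell, since Spark shares the Hive metastore. A small sketch, not part of the original session:

// list the tables in retail_db and dump the full metadata for cust
spark.catalog.listTables("retail_db").show(truncate = false)
spark.sql("describe formatted retail_db.cust").show(100, truncate = false)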


Import all the tables from MySQL to Hive with Snappy Compression and Parquet file creation using SQOOP

mysql> use retail_db;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
mysql> show tables;
+---------------------+
| Tables_in_retail_db |
+---------------------+
| categories          |
| customers           |
| departments         |
| order_items         |
| orders              |
| products            |
+---------------------+
6 rows in set (0.00 sec)


Here we are going to import all the tables of the MySQL retail_db database into Hive.

Additional options used: Snappy compression, and saving the data as Parquet files.


// Delete the database in Hive

hive> drop database ohm cascade;

OK

Time taken: 1.187 seconds

 

 

hive> create database ohm;

OK



sqoop import-all-tables \
--connect jdbc:mysql://localhost:3306/retail_db \
--username root \
--password cloudera \
--warehouse-dir /user/hive/warehouse \
-m 1 \
--hive-database ohm \
--hive-import \
--hive-overwrite \
--create-hive-table \
--compress \
--compression-codec snappy \
--as-parquetfile \
--outdir java_out;





hive> use ohm;

OK

Time taken: 0.01 seconds 

hive> show tables;

OK

categories

customers

departments

order_items

orders

products

Time taken: 0.029 seconds, Fetched: 6 row(s)



hive> describe database ohm;

OK

ohm hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db cloudera USER

Time taken: 0.011 seconds, Fetched: 1 row(s)
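
A quick sanity check from spark-shell (not part of the original session) that the sqoop import landed in the Hive metastore:

spark.sql("show tables in ohm").show(truncate = false)
spark.sql("select count(*) from ohm.orders").show()   // the orders table holds 68883 rows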




The following is how to import the tables one by one:

sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table categories \
--target-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table categories;


sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table customers \
--target-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table customers;



sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table departments \
--target-dir /user/cloudera/retail_db/dept \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table departments;



sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table order_items \
--target-dir /user/cloudera/retail_db/order_items \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table order_items;



sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table orders \
--target-dir /user/cloudera/retail_db/orders \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table orders;


sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table products \
--target-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table products;





// import-all-tables takes --warehouse-dir (not --target-dir) and no per-table options
sqoop import-all-tables \
--connect jdbc:mysql://localhost:3306/retail_db \
--username root \
--password cloudera \
--warehouse-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm;


hive> show databases;
OK
default
ohm


hive> use ohm;
OK



hive> show tables;
OK
categories
cust_parquet
customer_parquet_snappy
customers
departments
order_items
person_fixed
products


 

Thursday, 6 August 2020

Parse Fixed file format using Serde in Hive

How to parse a fixed-width text file using the regular expression SerDe (RegexSerDe) in Hive?

// We have a text file where every column has a fixed width:
// the id, name and age columns are 5, 10 and 2 characters wide, respectively.

$ hdfs dfs -cat /user/cloudera/person-fixed/p1.txt
00001Amar      28
00002Biswal    32
00003AnilKumble36
00004Bilal     53
00005KalaiSelvi32


// The regular expression captures the first 5 characters as id,
// the next 10 as name and the last 2 as age: "(.{5})(.{10})(.{2})"
// Specify only the HDFS folder as the location, not the file name
create external table person_fixed
(
id int,
name string,
age int
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
with SERDEPROPERTIES("input.regex"="(.{5})(.{10})(.{2})")
location '/user/cloudera/person-fixed';

hive> select * from person_fixed;
OK
1 Amar      28
2 Biswal    32
3 AnilKumble 36
4 Bilal      53
5 KalaiSelvi 32
Time taken: 0.045 seconds, Fetched: 5 row(s)
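
The same fixed-width file can also be sliced in spark-shell with substring(); a sketch under the same 5/10/2 column widths, not part of the original post:

// read each line whole, then cut the fixed-width fields by position (1-based)
val raw = spark.read.text("/user/cloudera/person-fixed")
val people = raw.selectExpr(
  "cast(substring(value, 1, 5) as int)  as id",
  "trim(substring(value, 6, 10))        as name",
  "cast(substring(value, 16, 2) as int) as age"
)
people.show()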

Parquet file creation with GZip Compression in Hive

GZip, Snappy and Uncompressed compression formats are supported for Parquet in Hive.

Set the compression properties in the Hive session:

hive> SET hive.exec.compress.output=true;
hive> SET parquet.compression=GZIP;

// CTAS into a Parquet table (note: despite the _snappy name, parquet.compression was set to GZIP above, and the parquet-tools meta output below confirms GZIP)

create table customer_parquet_snappy
stored as parquet
location '/user/cloudera/cust_parquet_snappy'
as select * from customers;

$ hdfs dfs -ls '/user/cloudera/cust_parquet_snappy';
Found 1 items
-rwxr-xr-x   1 cloudera cloudera     166435 2020-08-06 19:18 hdfs://localhost:8020/user/cloudera/cust_parquet_snappy/000000_0

$ parquet-tools head -n1 hdfs://localhost:8020/user/cloudera/cust_parquet_snappy/000000_0
id = 1
fname = Richard
lname = Hernandez
email = XXXXXXXXX
password = XXXXXXXXX
street = 6303 Heather Plaza
city = Brownsville
state = TX
zipcode = 78521


$  parquet-tools meta hdfs://localhost:8020/user/cloudera/cust_parquet_snappy/000000_0;
creator:     parquet-mr version 1.5.0-cdh5.13.0 (build ${buildNumber}) 

file schema: hive_schema 
-------------------------------------------------------------------------------------------------------------------------------------------------------------
id:          OPTIONAL INT32 R:0 D:1
fname:       OPTIONAL BINARY O:UTF8 R:0 D:1
lname:       OPTIONAL BINARY O:UTF8 R:0 D:1
email:       OPTIONAL BINARY O:UTF8 R:0 D:1
password:    OPTIONAL BINARY O:UTF8 R:0 D:1
street:      OPTIONAL BINARY O:UTF8 R:0 D:1
city:        OPTIONAL BINARY O:UTF8 R:0 D:1
state:       OPTIONAL BINARY O:UTF8 R:0 D:1
zipcode:     OPTIONAL BINARY O:UTF8 R:0 D:1

row group 1: RC:12435 TS:333854 
-------------------------------------------------------------------------------------------------------------------------------------------------------------
id:           INT32 GZIP DO:0 FPO:4 SZ:17317/49787/2.88 VC:12435 ENC:PLAIN,RLE,BIT_PACKED
fname:        BINARY GZIP DO:0 FPO:17321 SZ:10531/14512/1.38 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
lname:        BINARY GZIP DO:0 FPO:27852 SZ:18157/25572/1.41 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
email:        BINARY GZIP DO:0 FPO:46009 SZ:114/83/0.73 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
password:     BINARY GZIP DO:0 FPO:46123 SZ:114/83/0.73 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
street:       BINARY GZIP DO:0 FPO:46237 SZ:78850/186626/2.37 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
city:         BINARY GZIP DO:0 FPO:125087 SZ:16495/22872/1.39 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
state:        BINARY GZIP DO:0 FPO:141582 SZ:7213/9677/1.34 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
zipcode:      BINARY GZIP DO:0 FPO:148795 SZ:16847/24642/1.46 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
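
Since Parquet files are self-describing, the same data can be read straight back in spark-shell without any DDL; a sketch, not part of the original post:

val dfCustParquet = spark.read.parquet("/user/cloudera/cust_parquet_snappy")
dfCustParquet.printSchema()     // the same nine columns as the Hive table
dfCustParquet.count()           // 12435 rows, matching the RC value in the meta output above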


Create Parquet file in Hive

// Create a PARQUET file using the output of SELECT query


// output of SELECT query will be written as a parquet file in hdfs 
hive> create table cust_parquet 
stored as parquet
location '/user/cloudera/cust_parquet'
as SELECT * FROM CUSTOMERS;

hive> show tables;
OK
cust_parquet
customers

hive> describe formatted cust_parquet;
OK
# col_name            data_type            comment             
 
id                  int                                      
fname                string                                  
lname                string                                  
email                string                                  
password            string                                  
street              string                                  
city                string                                  
state                string                                  
zipcode              string                                  
 
# Detailed Table Information  
Database:            ohm                   
Owner:              cloudera             
CreateTime:          Thu Aug 06 19:04:16 PDT 2020  
LastAccessTime:      UNKNOWN               
Protect Mode:        None                 
Retention:          0                     
Location:            hdfs://quickstart.cloudera:8020/user/cloudera/cust_parquet  
Table Type:          MANAGED_TABLE         
Table Parameters:  
COLUMN_STATS_ACCURATE true                
numFiles            1                   
numRows              12435               
rawDataSize          111915              
totalSize            334655              
transient_lastDdlTime 1596765856          
 
# Storage Information  
SerDe Library:      org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe  
InputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat  
OutputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat  
Compressed:          No                   
Num Buckets:        -1                   
Bucket Columns:      []                   
Sort Columns:        []                   
Storage Desc Params:  
serialization.format 1                   
Time taken: 0.091 seconds, Fetched: 39 row(s)


// Display the parquet file in hdfs:

[cloudera@quickstart ~]$ hdfs dfs -ls hdfs://localhost:8020/user/cloudera/cust_parquet
Found 1 items
-rwxr-xr-x   1 cloudera cloudera     334655 2020-08-06 19:04 hdfs://localhost:8020/user/cloudera/cust_parquet/000000_0


// Display the content of Parquet file using Parquet-tools

[cloudera@quickstart ~]$ parquet-tools head -n3 hdfs://localhost:8020/user/cloudera/cust_parquet/000000_0
id = 1
fname = Richard 
lname = Hernandez
email = XXXXXXXXX
password = XXXXXXXXX
street = 6303 Heather Plaza
city = Brownsville
state = TX
zipcode = 78521

id = 2
fname = Mary
lname = Barrett
email = XXXXXXXXX
password = XXXXXXXXX
street = 9526 Noble Embers Ridge
city = Littleton
state = CO
zipcode = 80126

id = 3
fname = Ann
lname = Smith
email = XXXXXXXXX
password = XXXXXXXXX
street = 3422 Blue Pioneer Bend
city = Caguas
state = PR
zipcode = 00725

Hive Aggregation Functions

//Aggregation functions in Hive

hive> select city,count(1) as cnt from customers group by city order by cnt desc limit 10;

Caguas 4584
Chicago 274
Brooklyn 225
Los Angeles 224
New York 120
Philadelphia 105
Bronx 105
San Diego 104
Houston 91
Miami 87


hive> select city,count(1) as cnt from customers group by city having cnt >= 50 order by cnt desc ;

Caguas 4584
Chicago 274
Brooklyn 225
Los Angeles 224
New York 120
Bronx 105
Philadelphia 105
San Diego 104
Houston 91
Miami 87
Las Vegas 81
Dallas 75
San Jose 71
Aurora 64
Phoenix 64
Detroit 64
San Antonio 53
Lancaster 52
Virginia Beach 50

select min(id), max(id), sum(id), avg(id) from customers;

1 12435 77320830 6218.0
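
Since id runs from 1 to 12435 with no gaps, the sum and average can be checked in closed form: sum = n(n+1)/2 = 12435 * 12436 / 2 = 77320830 and avg = (n+1)/2 = 6218.0, matching the output above. A one-line spark-shell check:

val n = 12435L
println(n * (n + 1) / 2)   // 77320830
println((n + 1) / 2.0)     // 6218.0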

Tuesday, 4 August 2020

CTAS - Create Table As Select Example in Hive

CTAS - Create Table As Select
// We can redirect the output of a Hive statement into a brand new table using CTAS.

// Create table on the fly using the data coming from another table
hive> create table sub_maths_only as select * from ohm.subject where subject_name ='Maths';

hive> select * from sub_maths_only;
OK
100 Maths 10
103 Maths 5
106 Maths 25
109 Maths 11
Time taken: 0.091 seconds, Fetched: 4 row(s) 

hive> create table sub_science_only as select * from ohm.subject where subject_name ='Science';

hive> select * from sub_science_only;
OK
101 Science 20
104 Science 5
107 Science 25
110 Science 21
Time taken: 0.09 seconds, Fetched: 4 row(s)

// Aggregation sum, count example
hive> select subject_name, sum(students_attended), count(students_attended) from sub_maths_only GROUP BY subject_name;

Maths 51 4


hive> select subject_name, sum(students_attended), count(students_attended) from sub_science_only GROUP BY subject_name;

Science 71 4


select subject_name, sum(students_attended), count(students_attended) from sub_maths_only GROUP BY subject_name 
UNION ALL
select subject_name, sum(students_attended), count(students_attended) from sub_science_only GROUP BY subject_name;


Science 71 4
Maths 51 4

//CTAS -- Redirect the output of UNION ALL into a table
// Save the UNION ALL result in a subject_result table 
create table subject_result AS
select subject_name as subject, sum(students_attended) as totalstudents from sub_maths_only GROUP BY subject_name 
UNION ALL
select subject_name  as subject, sum(students_attended) as totalstudents from sub_science_only GROUP BY subject_name;

hive> select * from subject_result;
OK
Science 71
Maths 51
Time taken: 0.08 seconds, Fetched: 2 row(s)

External Table creation in Hive and loading 4 different data files located in HDFS

$ pwd
/home/cloudera/subjects

cat > list1.txt
id,subject_name,students_attended
100,Maths,10
101,Science,20

cat > list2.txt
id,subject_name,students_attended
103,Maths,5
104,Science,5

cat > list3.txt
id,subject_name,students_attended
106,Maths,25
107,Science,25

cat > list4.txt
id,subject_name,students_attended
109,Maths,11
110,Science,21

$ ls -lrt
total 16
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:24 list1.txt
-rw-rw-r-- 1 cloudera cloudera 77 Aug  4 05:24 list2.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:25 list3.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:25 list4.txt

$ cat *
id,subject_name,students_attended
100,Maths,10
101,Science,20
id,subject_name,students_attended
103,Maths,5
104,Science,5
id,subject_name,students_attended
106,Maths,25
107,Science,25
id,subject_name,students_attended
109,Maths,11
110,Science,21



$ hdfs dfs -mkdir /user/cloudera/subjects-data
$ hdfs dfs -copyFromLocal *.* /user/cloudera/subjects-data/

$ hdfs dfs -ls /user/cloudera/subjects-data/
Found 4 items
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list1.txt
-rw-r--r--   1 cloudera supergroup         77 2020-08-04 05:41 /user/cloudera/subjects-data/list2.txt
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list3.txt
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list4.txt




// no LOCATION clause here - the data is loaded below with LOAD DATA LOCAL INPATH
create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ",";



hive> LOAD DATA LOCAL INPATH '/home/cloudera/subjects/' INTO TABLE ohm.subject;
Loading data to table ohm.subject
Table ohm.subject stats: [numFiles=4, totalSize=314]
OK
Time taken: 0.97 second

// Header rows show up in between (as NULLs in the int columns) because each file has its own header line
hive> select * from ohm.subject;
OK
NULL subject_name NULL
100 Maths 10
101 Science 20
NULL subject_name NULL
103 Maths 5
104 Science 5
NULL subject_name NULL
106 Maths 25
107 Science 25
NULL subject_name NULL
109 Maths 11
110 Science 21
Time taken: 0.511 seconds, Fetched: 16 row(s)

// drop the table - we are going to re-create it
hive> drop table ohm.subject;
OK
Time taken: 0.189 seconds

// skip the header line of each file
// LOCATION points to the HDFS directory holding the files
create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ","
STORED AS TEXTFILE
LOCATION '/user/cloudera/subjects-data/'
TBLPROPERTIES("skip.header.line.count"="1");


hive> select * from ohm.subject;
OK
100 Maths 10
101 Science 20
103 Maths 5
104 Science 5
106 Maths 25
107 Science 25
109 Maths 11
110 Science 21
Time taken: 0.096 seconds, Fetched: 8 row(s)
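
For comparison, a spark-shell sketch (not part of the original post) reading the same four files, where header=true makes the CSV reader skip the first line of each file:

val subjects = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/user/cloudera/subjects-data/")

subjects.show()
subjects.groupBy("subject_name").sum("students_attended").show()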


// Aggregation operations in Hive


hive> select subject_name,sum(students_attended) from ohm.subject group by subject_name;

OK
Maths 51
Science 71



hive> select subject_name,sum(students_attended), count(subject_name) from ohm.subject group by subject_name;

Maths 51 4
Science 71 4

External table creation in Hive

// We have a data file on the local Linux filesystem
$ cat /home/cloudera/customers/customers.txt
id|fname|lname|city
100|Raja|Ravindar|Chennai
101|Suresh|Rahul|Trichy
102|Arun|Murali|Madurai
103|Kali|Das|Peravurani
104|Kalai|Selvi|Pattukottai

$ hdfs dfs -mkdir /user/cloudera/customer-ext/

// copy the local file into hdfs
$ hdfs dfs -copyFromLocal /home/cloudera/customers/customers.txt /user/cloudera/customer-ext/

// display the hdfs file
hdfs dfs -cat /user/cloudera/customer-ext/customers.txt
id|fname|lname|city
100|Raja|Ravindar|Chennai
101|Suresh|Rahul|Trichy
102|Arun|Murali|Madurai
103|Kali|Das|Peravurani
104|Kalai|Selvi|Pattukottai

hive> use ohm;
OK
Time taken: 0.022 seconds

// create an external table whose LOCATION points at the HDFS directory of the data file
// and skip the header line
create external table ohm.customers
(
id int,
fname string,
lname string,
city string
)
row format delimited
fields terminated by '|'
stored as textfile
location '/user/cloudera/customer-ext/'
tblproperties ("skip.header.line.count"="1");


hive> select * from ohm.customers;
OK
100 Raja Ravindar Chennai
101 Suresh Rahul Trichy
102 Arun Murali Madurai
103 Kali Das Peravurani
104 Kalai Selvi Pattukottai


hive> describe formatted ohm.customers;
OK
# col_name            data_type            comment             
 
id                  int                                      
fname                string                                  
lname                string                                  
city                string                                  
 
# Detailed Table Information  
Database:            ohm                   
Owner:              cloudera             
CreateTime:          Tue Aug 04 04:07:35 PDT 2020  
LastAccessTime:      UNKNOWN               
Protect Mode:        None                 
Retention:          0                     
Location:            hdfs://quickstart.cloudera:8020/user/cloudera/customer-ext  
Table Type:          EXTERNAL_TABLE       
Table Parameters:  
COLUMN_STATS_ACCURATE false               
EXTERNAL            TRUE                
numFiles            1                   
numRows              -1                  
rawDataSize          -1                  
skip.header.line.count 1                   
totalSize            146                 
transient_lastDdlTime 1596539255          
 
# Storage Information  
SerDe Library:      org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe  
InputFormat:        org.apache.hadoop.mapred.TextInputFormat  
OutputFormat:        org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat  
Compressed:          No                   
Num Buckets:        -1                   
Bucket Columns:      []                   
Sort Columns:        []                   
Storage Desc Params:  
field.delim          |                   
serialization.format |                   
Time taken: 0.051 seconds, Fetched: 37 row(s)


// Removing the table in Hive (External table)

hive> drop table ohm.customers;
OK
Time taken: 0.661 seconds


// When an external table is dropped, only the metastore entry is removed
// the data files in HDFS are left intact
$ hdfs dfs -ls /user/cloudera/customer-ext
Found 1 items
-rw-r--r--   1 cloudera supergroup        146 2020-08-04 04:05 /user/cloudera/customer-ext/customers.txt


Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...