
Tuesday, 11 August 2020

Read, Write CSV files using Spark

scala> val df = spark.sql("select * from ohm.orders")
df: org.apache.spark.sql.DataFrame = [order_id: int, order_date: bigint ... 2 more fields]

scala> df.printSchema()
root
 |-- order_id: integer (nullable = true)
 |-- order_date: long (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)
 

scala> val dfOrders = df.withColumnRenamed("order_id","id").withColumnRenamed("order_date","dt").withColumnRenamed("order_customer_id","cust_id").withColumnRenamed("order_status","status")
dfOrders: org.apache.spark.sql.DataFrame = [id: int, dt: bigint ... 2 more fields]

scala> dfOrders.printSchema()
root
 |-- id: integer (nullable = true)
 |-- dt: long (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- status: string (nullable = true)

scala> df.show(5)
+--------+-------------+-----------------+---------------+
|order_id|   order_date|order_customer_id|   order_status|
+--------+-------------+-----------------+---------------+
|       1|1374735600000|            11599|         CLOSED|
|       2|1374735600000|              256|PENDING_PAYMENT|
|       3|1374735600000|            12111|       COMPLETE|
|       4|1374735600000|             8827|         CLOSED|
|       5|1374735600000|            11318|       COMPLETE|
+--------+-------------+-----------------+---------------+

// Write the DataFrame out as CSV
scala> df.write.format("csv").save("/home/cloudera/sparkoutput/orders_csv")

// display the csv file content in terminal - top 10 lines
$ cat /home/cloudera/sparkoutput/orders_csv/part-00000-1f2f2adb-849e-49c0-bdef-8706cf1bcfb9-c000.csv | head
1,1374735600000,11599,CLOSED
2,1374735600000,256,PENDING_PAYMENT
3,1374735600000,12111,COMPLETE
4,1374735600000,8827,CLOSED
5,1374735600000,11318,COMPLETE
6,1374735600000,7130,COMPLETE
7,1374735600000,4530,COMPLETE
8,1374735600000,2911,PROCESSING
9,1374735600000,5657,PENDING_PAYMENT
10,1374735600000,5648,PENDING_PAYMENT
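
The save above used the CSV defaults (comma delimiter, no header line). Below is a minimal sketch, not run in this session, of the same write with a header row, a pipe delimiter and overwrite mode; the output path is just an example.

// illustrative only - options and path are not from the original session
df.write
  .format("csv")
  .option("header", "true")      // write the column names as the first line
  .option("delimiter", "|")      // use pipe instead of the default comma
  .mode("overwrite")             // replace the directory if it already exists
  .save("/home/cloudera/sparkoutput/orders_csv_with_header")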




// Read the CSV file using Spark -- the schema is not inferred here, so every field comes back as a string
scala> val dfOrdersCSV = spark.read.format("csv").load("/home/cloudera/sparkoutput/orders_csv/")
dfOrdersCSV: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 2 more fields]

scala> dfOrdersCSV.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)


// With inferSchema enabled, the corresponding data types are applied - but we still have no column names (headers)
scala> val dfOrdersCSV = spark.read.format("csv").option("inferSchema","True").load("/home/cloudera/sparkoutput/orders_csv/")
dfOrdersCSV: org.apache.spark.sql.DataFrame = [_c0: int, _c1: bigint ... 2 more fields]

scala> dfOrdersCSV.printSchema()
root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)


// A header option was added, yet the columns stay _c0.._c3: the option name is actually "header" (not "headers"), so it is ignored - and this CSV has no header row anyway
scala> val dfOrdersCSV = spark.read.format("csv").option("inferSchema","True").option("headers","true").load("/home/cloudera/sparkoutput/orders_csv/")
dfOrdersCSV: org.apache.spark.sql.DataFrame = [_c0: int, _c1: bigint ... 2 more fields]

scala> dfOrdersCSV.printSchema()
root
 |-- _c0: integer (nullable = true)
 |-- _c1: long (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 
 
 scala> val df = dfOrdersCSV.withColumnRenamed("_c0","order_id").withColumnRenamed("_c1","dt").withColumnRenamed("_c2","cust_id").withColumnRenamed("_c3","status")
df: org.apache.spark.sql.DataFrame = [order_id: int, dt: bigint ... 2 more fields]

scala> df.printSchema()
root
 |-- order_id: integer (nullable = true)
 |-- dt: long (nullable = true)
 |-- cust_id: integer (nullable = true)
 |-- status: string (nullable = true)


scala> df.show(5)
+--------+-------------+-------+---------------+
|order_id|           dt|cust_id|         status|
+--------+-------------+-------+---------------+
|       1|1374735600000|  11599|         CLOSED|
|       2|1374735600000|    256|PENDING_PAYMENT|
|       3|1374735600000|  12111|       COMPLETE|
|       4|1374735600000|   8827|         CLOSED|
|       5|1374735600000|  11318|       COMPLETE|
+--------+-------------+-------+---------------+
only showing top 5 rows
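
Instead of inferSchema followed by a chain of withColumnRenamed calls, the column names and types can be supplied up front when reading. A sketch, not part of the original session:

import org.apache.spark.sql.types._

// column names and types declared once - no renaming pass needed afterwards
val ordersSchema = StructType(Seq(
  StructField("order_id", IntegerType),
  StructField("dt",       LongType),
  StructField("cust_id",  IntegerType),
  StructField("status",   StringType)
))

val dfOrdersTyped = spark.read.format("csv")
  .schema(ordersSchema)
  .load("/home/cloudera/sparkoutput/orders_csv/")

dfOrdersTyped.printSchema()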

 
 



Read Hive table in Spark using Spark SQL | Read, Write JSON


// create a DataFrame from the Hive table ohm.customers
scala> val dfCust = spark.sql("select * from ohm.customers")
dfCust: org.apache.spark.sql.DataFrame = [customer_id: int, customer_fname: string ... 7 more fields]

scala> dfCust.printSchema()
root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- customer_zipcode: string (nullable = true)

// Rename the columns
scala> val df = dfCust.withColumnRenamed("customer_id","id").withColumnRenamed("customer_fname","fname").withColumnRenamed("customer_lname","lname").withColumnRenamed("customer_email","email").withColumnRenamed("customer_password","password").withColumnRenamed("customer_street","street").withColumnRenamed("customer_city","city").withColumnRenamed("customer_state","state").withColumnRenamed("customer_zipcode","zipcode")
df: org.apache.spark.sql.DataFrame = [id: int, fname: string ... 7 more fields]

scala> df.printSchema()
root
 |-- id: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- email: string (nullable = true)
 |-- password: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zipcode: string (nullable = true)

scala> df.show(10)
+---+-------+---------+---------+---------+--------------------+-----------+-----+-------+
| id|  fname|    lname|    email| password|              street|       city|state|zipcode|
+---+-------+---------+---------+---------+--------------------+-----------+-----+-------+
|  1|Richard|Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|Brownsville|   TX|  78521|
|  2|   Mary|  Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|  Littleton|   CO|  80126|
|  3|    Ann|    Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|     Caguas|   PR|  00725|
|  4|   Mary|    Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common| San Marcos|   CA|  92069|
|  5| Robert|   Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|     Caguas|   PR|  00725|
|  6|   Mary|    Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|    Passaic|   NJ|  07055|
|  7|Melissa|   Wilcox|XXXXXXXXX|XXXXXXXXX|9453 High Concession|     Caguas|   PR|  00725|
|  8|  Megan|    Smith|XXXXXXXXX|XXXXXXXXX|3047 Foggy Forest...|   Lawrence|   MA|  01841|
|  9|   Mary|    Perez|XXXXXXXXX|XXXXXXXXX| 3616 Quaking Street|     Caguas|   PR|  00725|
| 10|Melissa|    Smith|XXXXXXXXX|XXXXXXXXX|8598 Harvest Beac...|   Stafford|   VA|  22554|
+---+-------+---------+---------+---------+--------------------+-----------+-----+-------+
only showing top 10 rows

// write the DataFrame out as a JSON file
scala> df.write.format("json").save("/home/cloudera/sparkoutput/cust_json")

// display 10 lines
$ head /home/cloudera/sparkoutput/cust_json/part-00000-d87c7ca5-261d-4b90-9044-3ed4513f29c8-c000.json 

{"id":1,"fname":"Richard","lname":"Hernandez","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"6303 Heather Plaza","city":"Brownsville","state":"TX","zipcode":"78521"}
{"id":2,"fname":"Mary","lname":"Barrett","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"9526 Noble Embers Ridge","city":"Littleton","state":"CO","zipcode":"80126"}
{"id":3,"fname":"Ann","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3422 Blue Pioneer Bend","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":4,"fname":"Mary","lname":"Jones","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"8324 Little Common","city":"San Marcos","state":"CA","zipcode":"92069"}
{"id":5,"fname":"Robert","lname":"Hudson","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"10 Crystal River Mall ","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":6,"fname":"Mary","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3151 Sleepy Quail Promenade","city":"Passaic","state":"NJ","zipcode":"07055"}
{"id":7,"fname":"Melissa","lname":"Wilcox","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"9453 High Concession","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":8,"fname":"Megan","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3047 Foggy Forest Plaza","city":"Lawrence","state":"MA","zipcode":"01841"}
{"id":9,"fname":"Mary","lname":"Perez","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"3616 Quaking Street","city":"Caguas","state":"PR","zipcode":"00725"}
{"id":10,"fname":"Melissa","lname":"Smith","email":"XXXXXXXXX","password":"XXXXXXXXX","street":"8598 Harvest Beacon Plaza","city":"Stafford","state":"VA","zipcode":"22554"}


// read the JSON files back into a DataFrame
scala> val df = spark.read.format("json").load("/home/cloudera/sparkoutput/cust_json/")
df: org.apache.spark.sql.DataFrame = [city: string, email: string ... 7 more fields]

scala> df.printSchema()
root
 |-- city: string (nullable = true)
 |-- email: string (nullable = true)
 |-- fname: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lname: string (nullable = true)
 |-- password: string (nullable = true)
 |-- state: string (nullable = true)
 |-- street: string (nullable = true)
 |-- zipcode: string (nullable = true)


scala> df.show(5)
+-----------+---------+-------+---+---------+---------+-----+--------------------+-------+
|       city|    email|  fname| id|    lname| password|state|              street|zipcode|
+-----------+---------+-------+---+---------+---------+-----+--------------------+-------+
|Brownsville|XXXXXXXXX|Richard|  1|Hernandez|XXXXXXXXX|   TX|  6303 Heather Plaza|  78521|
|  Littleton|XXXXXXXXX|   Mary|  2|  Barrett|XXXXXXXXX|   CO|9526 Noble Embers...|  80126|
|     Caguas|XXXXXXXXX|    Ann|  3|    Smith|XXXXXXXXX|   PR|3422 Blue Pioneer...|  00725|
| San Marcos|XXXXXXXXX|   Mary|  4|    Jones|XXXXXXXXX|   CA|  8324 Little Common|  92069|
|     Caguas|XXXXXXXXX| Robert|  5|   Hudson|XXXXXXXXX|   PR|10 Crystal River ...|  00725|
+-----------+---------+-------+---+---------+---------+-----+--------------------+-------+
only showing top 5 rows



// select specific columns only
scala> df.select("id","fname","lname","city").show(5)
+---+-------+---------+-----------+
| id|  fname|    lname|       city|
+---+-------+---------+-----------+
|  1|Richard|Hernandez|Brownsville|
|  2|   Mary|  Barrett|  Littleton|
|  3|    Ann|    Smith|     Caguas|
|  4|   Mary|    Jones| San Marcos|
|  5| Robert|   Hudson|     Caguas|
+---+-------+---------+-----------+
only showing top 5 rows

// write the selected columns out as JSON
scala> df.select("id","fname","lname","city").write.format("json").save("/home/cloudera/sparkoutput/custjson_4cols")


// display top 10 lines 
$ head /home/cloudera/sparkoutput/custjson_4cols/part-00000-9191ead8-927e-4c45-8807-6e6ad40abb33-c000.json 
{"id":1,"fname":"Richard","lname":"Hernandez","city":"Brownsville"}
{"id":2,"fname":"Mary","lname":"Barrett","city":"Littleton"}
{"id":3,"fname":"Ann","lname":"Smith","city":"Caguas"}
{"id":4,"fname":"Mary","lname":"Jones","city":"San Marcos"}
{"id":5,"fname":"Robert","lname":"Hudson","city":"Caguas"}
{"id":6,"fname":"Mary","lname":"Smith","city":"Passaic"}
{"id":7,"fname":"Melissa","lname":"Wilcox","city":"Caguas"}
{"id":8,"fname":"Megan","lname":"Smith","city":"Lawrence"}
{"id":9,"fname":"Mary","lname":"Perez","city":"Caguas"}
{"id":10,"fname":"Melissa","lname":"Smith","city":"Stafford"}


Saturday, 8 August 2020

Bucketing a table in Hive

// Here we are going to create a bucketed table in Hive and populate it from the orders table

CREATE EXTERNAL TABLE ohm.orders_bucketed
(
orderid int,
date string,
custid int,
status string
)
clustered by (custid) into 10 buckets
row format delimited 
fields terminated by ',';


Location:  hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed  



// When partitioning, the partition column is NOT added to the table's column list.
// When bucketing, the bucket column IS included in the table's column list.

// enable the bucketing property
set hive.enforce.bucketing = true;

// populate it with data taken from the orders table
hive> insert overwrite table ohm.orders_bucketed select order_id, order_date, order_customer_id, order_status from ohm.orders;

number of mappers: 1; number of reducers: 10
Loading data to table ohm.orders_bucketed
Table ohm.orders_bucketed stats: [numFiles=10, numRows=68883, totalSize=2448880, rawDataSize=2379997]


hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed
Found 10 items
-rwxrwxrwx   1 cloudera supergroup     246834 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000000_0
-rwxrwxrwx   1 cloudera supergroup     242162 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000001_0
-rwxrwxrwx   1 cloudera supergroup     246462 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000002_0
-rwxrwxrwx   1 cloudera supergroup     244385 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000003_0
-rwxrwxrwx   1 cloudera supergroup     243974 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000004_0
-rwxrwxrwx   1 cloudera supergroup     242629 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000005_0
-rwxrwxrwx   1 cloudera supergroup     241932 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000006_0
-rwxrwxrwx   1 cloudera supergroup     247100 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000007_0
-rwxrwxrwx   1 cloudera supergroup     248135 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000008_0
-rwxrwxrwx   1 cloudera supergroup     245267 2020-08-08 02:12 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000009_0
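
Each row lands in a bucket based on the hash of the clustering column modulo the bucket count; with Hive's default hash for integer columns this is simply the value itself, so file 000003_0 should hold only rows with custid % 10 == 3. A spark-shell sketch (not from the original post) to spot-check one bucket file:

// read one bucket file as CSV; _c2 is the customer id column in this layout
val bucket3 = spark.read.format("csv")
  .load("hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_bucketed/000003_0")

// if bucketing worked, this should show the single value 3
bucket3.selectExpr("cast(_c2 as int) % 10 as bucket_no").distinct().show()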

Friday, 7 August 2020

Partitioned table creation in Hive

// How to create a partitioned table in Hive?


// This is the source table, which is not partitioned

CREATE EXTERNAL TABLE orders
(
orderid INT,
date STRING,
custid INT,
status STRING
)
row format delimited
fields terminated by ","
location '/user/cloudera/orders';



hive> describe orders;

OK

order_id            int                                      

order_date          bigint                                  

order_customer_id    int                                      

order_status        string                                  

Time taken: 0.051 seconds, Fetched: 4 row(s)


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders



// find the unique statuses 

hive> select distinct order_status from orders;


CANCELED

CLOSED

COMPLETE

ON_HOLD

PAYMENT_REVIEW

PENDING

PENDING_PAYMENT

PROCESSING

SUSPECTED_FRAUD



// set the properties

set hive.exec.dynamic.partition=true;

set hive.exec.dynamic.partition.mode=nonstrict;


// Create the table partitioned by status - the partition column is not included in the column list

create external table orders_partitioned
(
orderid int,
date string,
custid int
)
partitioned by (status string)
row format delimited
fields terminated by ",";




# Partition Information  

# col_name            data_type            comment             

 

status              string                                  

 

Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned  


Table Parameters:  

EXTERNAL            TRUE                

numPartitions        0        ---> No data available so numPartitions is zero as of now.            

transient_lastDdlTime 1596822311  

// Populate partitions with data taken from another table

insert overwrite table orders_partitioned
partition (status)
select order_id, order_date, order_customer_id, order_status from orders;



Loading partition {status=PENDING}

Loading partition {status=COMPLETE}

Loading partition {status=CANCELED}

Loading partition {status=CLOSED}

Loading partition {status=PROCESSING}

Loading partition {status=PAYMENT_REVIEW}

Loading partition {status=SUSPECTED_FRAUD}

Loading partition {status=ON_HOLD}

Loading partition {status=PENDING_PAYMENT}

Table Parameters:  

EXTERNAL            TRUE                

numPartitions        9                   -- We have populated the data taken from another table. so numPartitions is 9

transient_lastDdlTime 1596822311  


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned  

//Find the files in HDFS after populating the data :

hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned

Found 9 items

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CANCELED

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=CLOSED

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=ON_HOLD

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PAYMENT_REVIEW

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PENDING_PAYMENT

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=PROCESSING

drwxrwxrwx   - cloudera supergroup          0 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=SUSPECTED_FRAUD



hdfs dfs -ls hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE

Found 1 items

-rwxrwxrwx   1 cloudera supergroup     571301 2020-08-07 10:49 hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0


$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | head 

3,1374735600000,12111

5,1374735600000,11318

6,1374735600000,7130

7,1374735600000,4530

15,1374735600000,2568

17,1374735600000,2667

22,1374735600000,333

26,1374735600000,7562

28,1374735600000,656

32,1374735600000,3960



// Performance Analysis 


// Find the line count

$ hdfs dfs -cat hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db/orders_partitioned/status=COMPLETE/000000_0 | wc -l

22899


// Find the row count from the source table, which is not partitioned

hive> select count(1) from orders where order_status ='COMPLETE';

OK

22899

MapReduce Jobs Launched: 

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 5.95 sec   HDFS Read: 45693 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 5 seconds 950 msec   ---> 6 SECONDS



// Find the row count of a partitioned table 

hive> select count(1) from orders_partitioned where status ='COMPLETE';


MapReduce Jobs Launched: 

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 3.38 sec   HDFS Read: 579428 HDFS Write: 6 SUCCESS

Total MapReduce CPU Time Spent: 3 seconds 380 msec  ----> 3.4 SECONDS 



hive> show partitions ohm.orders_partitioned;

OK 

status=CANCELED

status=CLOSED

status=COMPLETE

status=ON_HOLD

status=PAYMENT_REVIEW

status=PENDING

status=PENDING_PAYMENT

status=PROCESSING

status=SUSPECTED_FRAUD

Time taken: 0.113 seconds, Fetched: 9 row(s)
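
Because the table is laid out as one directory per status, a query that filters on the partition column only has to read that one directory (partition pruning). A quick spark-shell check, not part of the original hive session:

// the partition column behaves like a normal column when querying
val complete = spark.sql("select * from ohm.orders_partitioned where status = 'COMPLETE'")
complete.count()     // expected to match the 22899 rows counted above
complete.explain()   // the physical plan should show the partition filter on status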





String and Date Functions in Hive

// Here we are going to play with String and Date Functions in Hive


 hive> create table cust as select customer_fname  as firstname, customer_lname as lastname, customer_city  as city from ohm.customers;


hive> select * from cust limit 10;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton

Ann Smith Caguas

Mary Jones San Marcos

Robert Hudson Caguas

Mary Smith Passaic

Melissa Wilcox Caguas

Megan Smith Lawrence

Mary Perez Caguas

Melissa Smith Stafford


hive> describe cust;

OK

firstname            string                                  

lastname            string                                  

city                string                                  

Time taken: 0.041 seconds, Fetched: 3 row(s)

                                




hive> alter table cust change customer_city city string;

OK

Time taken: 0.308 seconds



// Concat function

hive> select concat(firstname , "-",   lastname, "-" , city ) from cust limit 2;

OK

Richard-Hernandez-Brownsville

Mary-Barrett-Littleton

 

hive> select concat(firstname , "|",   lastname, "|" , city ) from cust limit 2;

OK

Richard|Hernandez|Brownsville

Mary|Barrett|Littleton



hive> select concat(firstname , "\t",   lastname, "\t" , city ) from cust limit 2;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton



hive> select concat_ws("\t",firstname,lastname,city ) from cust limit 2;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton

Time taken: 0.138 seconds, Fetched: 2 row(s)

hive> select concat_ws("$",firstname,lastname,city ) from cust limit 2;

OK

Richard$Hernandez$Brownsville

Mary$Barrett$Littleton

Time taken: 0.057 seconds, Fetched: 2 row(s)

hive> select concat_ws("|",firstname,lastname,city ) from cust limit 2;

OK

Richard|Hernandez|Brownsville

Mary|Barrett|Littleton

Time taken: 0.04 seconds, Fetched: 2 row(s)



hive> select concat(firstname , "\t",   lastname, "\t" , city ) from cust limit 10;

OK

Richard Hernandez Brownsville

Mary Barrett Littleton

Ann Smith Caguas

Mary Jones San Marcos

Robert Hudson Caguas

Mary Smith Passaic

Melissa Wilcox Caguas

Megan Smith Lawrence

Mary Perez Caguas

Melissa Smith Stafford


 

hive> select concat(firstname , "|",   lastname, "|" , city ) from cust limit 10;

OK

Richard|Hernandez|Brownsville

Mary|Barrett|Littleton

Ann|Smith|Caguas

Mary|Jones|San Marcos

Robert|Hudson|Caguas

Mary|Smith|Passaic

Melissa|Wilcox|Caguas

Megan|Smith|Lawrence

Mary|Perez|Caguas

Melissa|Smith|Stafford

 


hive> select length(firstname) as Lenn , firstname from cust where length(firstname)  > 8 limit 5;

OK

11 Christopher

9 Katherine

9 Stephanie

9 Katherine

9 Alexander



hive> select length("Raja Ram Mohan Rai"), "Raja Ram Mohan Rai";

OK

18 Raja Ram Mohan Rai


hive> select concat (length("S/o. Mahalakshmi") ,".  ", "S/o. Mahalakshmi");

OK

16.  S/o. Mahalakshmi


hive> select concat_ws(":","Kadavul","Amaithu","Vaitha","Medai");

OK

Kadavul:Amaithu:Vaitha:Medai

 

hive> select concat_ws("|","Kadavul","Amaithu","Vaitha","Medai");

OK

Kadavul|Amaithu|Vaitha|Medai




hive> select concat(firstname , " " , lastname) from cust where length(firstname) > 10 limit 5;

OK

Christopher Smith

Christopher Prince

Christopher Smith

Christopher Curtis

Christopher Johnson

Time taken: 0.044 seconds, Fetched: 5 row(s)



hive> select concat(firstname , " " , lastname) from cust where length(concat(firstname , " " , lastname)) > 20 limit 5;

OK

Christopher Richardson

Christopher Villarreal

Christopher Alexander

Time taken: 0.094 seconds, Fetched: 3 row(s)


// Case conversion

hive> select UPPER(firstname), LOWER(lastname), upper(city)  from cust limit 2;

OK

RICHARD hernandez BROWNSVILLE

MARY barrett LITTLETON

Time taken: 0.343 seconds, Fetched: 2 row(s)


// Padding Left, Right

hive> select lpad(firstname,12,"*"), rpad(lastname,12,"*") from cust limit 2;

OK

*****Richard Hernandez***

********Mary Barrett*****



hive> select upper("I love India"), lower("I love India");

OK

I LOVE INDIA i love india

Time taken: 0.031 seconds, Fetched: 1 row(s)


// Split function

hive> select split("Arun Raja Kama Raj"," "); 

OK

["Arun","Raja","Kama","Raj"]



hive> select substr("Raja Raja Chozhan",11);

OK

Chozhan



hive> select substr(firstname,1,3), " ", firstname from cust limit 5;

OK

Ric Richard

Mar Mary

Ann Ann

Mar Mary

Rob Robert




hive> select regexp_replace("Raja Raja Chozhan","Raja","King");

OK

King King Chozhan



hive> select  firstname ," ", regexp_replace(firstname,"Mary","Laila"), " " from cust limit 5;

OK

Richard Richard  

Mary Laila  

Ann Ann  

Mary Laila  

Robert Robert  



hive> select current_date(), year(current_date()), month(current_date()) , date(current_date()), day(current_date()) ;

OK

2020-08-07 2020 8 2020-08-07 7

Time taken: 0.04 seconds, Fetched: 1 row(s)



hive> select current_timestamp(), hour(current_timestamp()), minute(current_timestamp()), second(current_timestamp());

OK

2020-08-07 08:32:41.383 8 32 41



hive> select datediff(current_date(), "1976-04-20");

OK

16180



hive> select datediff(current_date(), "2020-01-01");

OK

219



hive> select date_add(current_date(),100), date_sub(current_date(),219);

OK

2020-11-15 2020-01-01



hive> select months_between(current_date(),"2020-01-01");

OK

7.19354839



hive> select add_months("2020-01-01",7);

OK

2020-08-01



hive> select current_date(), current_timestamp();

OK

2020-08-07 2020-08-07 08:38:28.5



hive> select current_date(), current_timestamp(), unix_timestamp();

unix_timestamp(void) is deprecated. Use current_timestamp instead.

OK

2020-08-07 2020-08-07 08:38:53.775 1596814733



hive> select unix_timestamp(current_timestamp);

OK

1596814766
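
One related conversion worth noting, shown here as a spark-shell sketch rather than part of the hive session above: the order_date values seen earlier (e.g. 1374735600000) are epoch milliseconds, while from_unixtime() expects seconds, so divide by 1000 before formatting.

spark.sql("""
  select order_id,
         from_unixtime(cast(order_date / 1000 as bigint), 'yyyy-MM-dd') as order_day
  from ohm.orders
""").show(5)
// 1374735600000 ms -> 1374735600 s -> 2013-07-25 (in the VM's local time zone)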


Database, Table commands in Hive


// Create a new database in Hive 

hive> create database retail_db;



// create a new table from data in a table that belongs to another database

hive> create table retail_db.customer_temp as select customer_id as id, customer_fname as fname, customer_lname as lname from ohm.customers;


hive> use retail_db;

OK

Time taken: 0.014 seconds



hive> show tables;

OK

customer_temp

Time taken: 0.01 seconds, Fetched: 1 row(s)





hive> describe customer_temp;

OK

id                  int                                      

fname                string                                  

lname                string                                  

Time taken: 0.05 seconds, Fetched: 3 row(s)

hive> describe formatted customer_temp;

OK

# col_name            data_type            comment             

 

id                  int                                      

fname                string                                  

lname                string                                  

 

# Detailed Table Information  

Database:            retail_db             

Owner:              cloudera             

CreateTime:          Fri Aug 07 05:44:14 PDT 2020  

LastAccessTime:      UNKNOWN               

Protect Mode:        None                 

Retention:          0                     

Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/retail_db.db/customer_temp  

Table Type:          MANAGED_TABLE         

Table Parameters:  

COLUMN_STATS_ACCURATE true                

numFiles            1                   

numRows              12435               

rawDataSize          211716              

totalSize            224151              

transient_lastDdlTime 1596804254          

 

# Storage Information  

SerDe Library:      org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe  

InputFormat:        org.apache.hadoop.mapred.TextInputFormat  

OutputFormat:        org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat  

Compressed:          No                   

Num Buckets:        -1                   

Bucket Columns:      []                   

Sort Columns:        []                   

Storage Desc Params:  

serialization.format 1                   

Time taken: 0.066 seconds, Fetched: 33 row(s)



// Renaming a database is not supported in Hive (though there is a workaround)

ALTER DATABASE retail_db RENAME TO retail_new;  // not possible in CDH 5.13


// Rename the table 

hive> alter table customer_temp rename to cust;

OK

Time taken: 0.114 seconds



hive> show tables;

OK

cust

Time taken: 0.018 seconds, Fetched: 1 row(s)



hive> describe cust;

OK

id                  int                                      

fname                string                                  

lname                string                                  

Time taken: 0.057 seconds, Fetched: 3 row(s)


// How to rename the column?

hive> alter table cust change id custid int;

OK

Time taken: 0.128 seconds



hive> describe cust;

OK

custid              int                                      

fname                string                                  

lname                string                                  

Time taken: 0.051 seconds, Fetched: 3 row(s)


// How to replace the names of a set of columns in one statement?


hive> alter table cust replace columns (id int, firstname string, lastname string);

OK

Time taken: 0.088 seconds




hive> describe cust;

OK

id                  int                                      

firstname            string                                  

lastname            string                                  

Time taken: 0.048 seconds, Fetched: 3 row(s)



// How to add new columns to the existing table?


hive> alter table cust add columns (age int, email string);

OK

Time taken: 0.069 seconds



hive> describe cust;

OK

id                  int                                      

firstname            string                                  

lastname            string                                  

age                  int      -------> New column                                  

email                string       -------> New column                                      

Time taken: 0.047 seconds, Fetched: 5 row(s)


// Newly added columns hold NULL for the existing rows

hive> select * from cust limit 10;



1 Richard Hernandez NULL NULL

2 Mary Barrett NULL NULL

3 Ann Smith NULL NULL

4 Mary Jones NULL NULL

5 Robert Hudson NULL NULL

6 Mary Smith NULL NULL

7 Melissa Wilcox NULL NULL

8 Megan Smith NULL NULL

9 Mary Perez NULL NULL

10 Melissa Smith NULL NULL

Time taken: 0.063 seconds, Fetched: 10 row(s)



// How to change the file format of existing table?

hive> alter table cust set fileformat parquet;

OK

Time taken: 0.089 seconds


SerDe Library:      org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe  

InputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat  

OutputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat  



hive> alter table cust set fileformat avro;

OK

Time taken: 0.108 seconds


hive> describe formatted cust;


SerDe Library:      org.apache.hadoop.hive.serde2.avro.AvroSerDe  

InputFormat:        org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat  

OutputFormat:        org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat


Location:            hdfs://quickstart.cloudera:8020/user/hive/warehouse/retail_db.db/cust




// How to change the location of a table?


hive> alter table cust set location '/user/cloudera/customer';

OK


hive> describe formatted cust;


Location:            hdfs://quickstart.cloudera:8020/user/cloudera/customer
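
The same metadata can be inspected from spark-shell, since Spark shares the Hive metastore. A small sketch, not part of the original session:

// list the tables in retail_db and dump the full metadata for cust
spark.catalog.listTables("retail_db").show(truncate = false)
spark.sql("describe formatted retail_db.cust").show(100, truncate = false)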


Import all the tables from MySQL to Hive with Snappy Compression and Parquet file creation using SQOOP

mysql> use retail_db;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
mysql> show tables;
+---------------------+
| Tables_in_retail_db |
+---------------------+
| categories          |
| customers           |
| departments         |
| order_items         |
| orders              |
| products            |
+---------------------+
6 rows in set (0.00 sec)


Here we are going to import all the tables of the MySQL retail_db database into Hive.

Additional options used: Snappy compression, and saving the data as Parquet files.


// Delete the database in Hive

hive> drop database ohm cascade;

OK

Time taken: 1.187 seconds

 

 

hive> create database ohm;

OK



sqoop import-all-tables \
--connect jdbc:mysql://localhost:3306/retail_db \
--username root \
--password cloudera \
--warehouse-dir /user/hive/warehouse \
-m 1 \
--hive-database ohm \
--hive-import \
--hive-overwrite \
--create-hive-table \
--compress \
--compression-codec snappy \
--as-parquetfile \
--outdir java_out;





hive> use ohm;

OK

Time taken: 0.01 seconds 

hive> show tables;

OK

categories

customers

departments

order_items

orders

products

Time taken: 0.029 seconds, Fetched: 6 row(s)



hive> describe database ohm;

OK

ohm hdfs://quickstart.cloudera:8020/user/hive/warehouse/ohm.db cloudera USER

Time taken: 0.011 seconds, Fetched: 1 row(s)
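
A quick sanity check from spark-shell (not part of the original session) that the sqoop import landed in the Hive metastore:

spark.sql("show tables in ohm").show(truncate = false)
spark.sql("select count(*) from ohm.orders").show()   // the orders table holds 68883 rows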




The following is how to import the tables one by one:

sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table categories \
--target-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table categories;


sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table customers \
--target-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table customers;



sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table departments \
--target-dir /user/cloudera/retail_db/dept \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table departments;



sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table order_items \
--target-dir /user/cloudera/retail_db/order_items \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table order_items;



sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table orders \
--target-dir /user/cloudera/retail_db/orders \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table orders;


sqoop import \
--connect "jdbc:mysql://localhost:3306/retail_db" \
--username root \
--password cloudera \
--table products \
--target-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm \
--hive-table products;





// import-all-tables takes --warehouse-dir (not --target-dir) and no per-table options
sqoop import-all-tables \
--connect jdbc:mysql://localhost:3306/retail_db \
--username root \
--password cloudera \
--warehouse-dir /user/cloudera/retail_db \
--hive-import \
--create-hive-table \
--hive-database ohm;


hive> show databases;
OK
default
ohm


hive> use ohm;
OK



hive> show tables;
OK
categories
cust_parquet
customer_parquet_snappy
customers
departments
order_items
person_fixed
products


 

Thursday, 6 August 2020

Parse Fixed file format using Serde in Hive

How to parse a fixed-width text file using the regular expression SerDe (RegexSerDe) in Hive?

// We have a text file where every column has a fixed width:
// the id, name and age columns are 5, 10 and 2 characters wide, respectively.

$ hdfs dfs -cat /user/cloudera/person-fixed/p1.txt
00001Amar      28
00002Biswal    32
00003AnilKumble36
00004Bilal     53
00005KalaiSelvi32


// The regular expression captures the first 5 characters as id,
// the next 10 as name and the last 2 as age: "(.{5})(.{10})(.{2})"
// Specify only the HDFS folder as the location, not the file name
create external table person_fixed
(
id int,
name string,
age int
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
with SERDEPROPERTIES("input.regex"="(.{5})(.{10})(.{2})")
location '/user/cloudera/person-fixed';

hive> select * from person_fixed;
OK
1 Amar      28
2 Biswal    32
3 AnilKumble 36
4 Bilal      53
5 KalaiSelvi 32
Time taken: 0.045 seconds, Fetched: 5 row(s)
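
The same fixed-width file can also be sliced in spark-shell with substring(); a sketch under the same 5/10/2 column widths, not part of the original post:

// read each line whole, then cut the fixed-width fields by position (1-based)
val raw = spark.read.text("/user/cloudera/person-fixed")
val people = raw.selectExpr(
  "cast(substring(value, 1, 5) as int)  as id",
  "trim(substring(value, 6, 10))        as name",
  "cast(substring(value, 16, 2) as int) as age"
)
people.show()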

Parquet file creation with GZip Compression in Hive

GZip, Snappy and Uncompressed compression formats are supported for Parquet in Hive.

Set the compression properties in the Hive session:

hive> SET hive.exec.compress.output=true;
hive> SET parquet.compression=GZIP;

// CTAS into a Parquet table (note: despite the _snappy name, parquet.compression was set to GZIP above, and the parquet-tools meta output below confirms GZIP)

create table customer_parquet_snappy
stored as parquet
location '/user/cloudera/cust_parquet_snappy'
as select * from customers;

$ hdfs dfs -ls '/user/cloudera/cust_parquet_snappy';
Found 1 items
-rwxr-xr-x   1 cloudera cloudera     166435 2020-08-06 19:18 hdfs://localhost:8020/user/cloudera/cust_parquet_snappy/000000_0

$ parquet-tools head -n1 hdfs://localhost:8020/user/cloudera/cust_parquet_snappy/000000_0
id = 1
fname = Richard
lname = Hernandez
email = XXXXXXXXX
password = XXXXXXXXX
street = 6303 Heather Plaza
city = Brownsville
state = TX
zipcode = 78521


$  parquet-tools meta hdfs://localhost:8020/user/cloudera/cust_parquet_snappy/000000_0;
creator:     parquet-mr version 1.5.0-cdh5.13.0 (build ${buildNumber}) 

file schema: hive_schema 
-------------------------------------------------------------------------------------------------------------------------------------------------------------
id:          OPTIONAL INT32 R:0 D:1
fname:       OPTIONAL BINARY O:UTF8 R:0 D:1
lname:       OPTIONAL BINARY O:UTF8 R:0 D:1
email:       OPTIONAL BINARY O:UTF8 R:0 D:1
password:    OPTIONAL BINARY O:UTF8 R:0 D:1
street:      OPTIONAL BINARY O:UTF8 R:0 D:1
city:        OPTIONAL BINARY O:UTF8 R:0 D:1
state:       OPTIONAL BINARY O:UTF8 R:0 D:1
zipcode:     OPTIONAL BINARY O:UTF8 R:0 D:1

row group 1: RC:12435 TS:333854 
-------------------------------------------------------------------------------------------------------------------------------------------------------------
id:           INT32 GZIP DO:0 FPO:4 SZ:17317/49787/2.88 VC:12435 ENC:PLAIN,RLE,BIT_PACKED
fname:        BINARY GZIP DO:0 FPO:17321 SZ:10531/14512/1.38 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
lname:        BINARY GZIP DO:0 FPO:27852 SZ:18157/25572/1.41 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
email:        BINARY GZIP DO:0 FPO:46009 SZ:114/83/0.73 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
password:     BINARY GZIP DO:0 FPO:46123 SZ:114/83/0.73 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
street:       BINARY GZIP DO:0 FPO:46237 SZ:78850/186626/2.37 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
city:         BINARY GZIP DO:0 FPO:125087 SZ:16495/22872/1.39 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
state:        BINARY GZIP DO:0 FPO:141582 SZ:7213/9677/1.34 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
zipcode:      BINARY GZIP DO:0 FPO:148795 SZ:16847/24642/1.46 VC:12435 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
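
Since Parquet files are self-describing, the same data can be read straight back in spark-shell without any DDL; a sketch, not part of the original post:

val dfCustParquet = spark.read.parquet("/user/cloudera/cust_parquet_snappy")
dfCustParquet.printSchema()     // the same nine columns as the Hive table
dfCustParquet.count()           // 12435 rows, matching the RC value in the meta output above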


Create Parquet file in Hive

// Create a PARQUET file using the output of SELECT query


// output of SELECT query will be written as a parquet file in hdfs 
hive> create table cust_parquet 
stored as parquet
location '/user/cloudera/cust_parquet'
as SELECT * FROM CUSTOMERS;

hive> show tables;
OK
cust_parquet
customers

hive> describe formatted cust_parquet;
OK
# col_name            data_type            comment             
 
id                  int                                      
fname                string                                  
lname                string                                  
email                string                                  
password            string                                  
street              string                                  
city                string                                  
state                string                                  
zipcode              string                                  
 
# Detailed Table Information  
Database:            ohm                   
Owner:              cloudera             
CreateTime:          Thu Aug 06 19:04:16 PDT 2020  
LastAccessTime:      UNKNOWN               
Protect Mode:        None                 
Retention:          0                     
Location:            hdfs://quickstart.cloudera:8020/user/cloudera/cust_parquet  
Table Type:          MANAGED_TABLE         
Table Parameters:  
COLUMN_STATS_ACCURATE true                
numFiles            1                   
numRows              12435               
rawDataSize          111915              
totalSize            334655              
transient_lastDdlTime 1596765856          
 
# Storage Information  
SerDe Library:      org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe  
InputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat  
OutputFormat:        org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat  
Compressed:          No                   
Num Buckets:        -1                   
Bucket Columns:      []                   
Sort Columns:        []                   
Storage Desc Params:  
serialization.format 1                   
Time taken: 0.091 seconds, Fetched: 39 row(s)


// Display the parquet file in hdfs:

[cloudera@quickstart ~]$ hdfs dfs -ls hdfs://localhost:8020/user/cloudera/cust_parquet
Found 1 items
-rwxr-xr-x   1 cloudera cloudera     334655 2020-08-06 19:04 hdfs://localhost:8020/user/cloudera/cust_parquet/000000_0


// Display the content of Parquet file using Parquet-tools

[cloudera@quickstart ~]$ parquet-tools head -n3 hdfs://localhost:8020/user/cloudera/cust_parquet/000000_0
id = 1
fname = Richard 
lname = Hernandez
email = XXXXXXXXX
password = XXXXXXXXX
street = 6303 Heather Plaza
city = Brownsville
state = TX
zipcode = 78521

id = 2
fname = Mary
lname = Barrett
email = XXXXXXXXX
password = XXXXXXXXX
street = 9526 Noble Embers Ridge
city = Littleton
state = CO
zipcode = 80126

id = 3
fname = Ann
lname = Smith
email = XXXXXXXXX
password = XXXXXXXXX
street = 3422 Blue Pioneer Bend
city = Caguas
state = PR
zipcode = 00725

Hive Aggregation Functions

//Aggregation functions in Hive

hive> select city,count(1) as cnt from customers group by city order by cnt desc limit 10;

Caguas 4584
Chicago 274
Brooklyn 225
Los Angeles 224
New York 120
Philadelphia 105
Bronx 105
San Diego 104
Houston 91
Miami 87


hive> select city,count(1) as cnt from customers group by city having cnt >= 50 order by cnt desc ;

Caguas 4584
Chicago 274
Brooklyn 225
Los Angeles 224
New York 120
Bronx 105
Philadelphia 105
San Diego 104
Houston 91
Miami 87
Las Vegas 81
Dallas 75
San Jose 71
Aurora 64
Phoenix 64
Detroit 64
San Antonio 53
Lancaster 52
Virginia Beach 50

select min(id), max(id), sum(id), avg(id) from customers;

1 12435 77320830 6218.0
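
Since id runs from 1 to 12435 with no gaps, the sum and average can be checked in closed form: sum = n(n+1)/2 = 12435 * 12436 / 2 = 77320830 and avg = (n+1)/2 = 6218.0, matching the output above. A one-line spark-shell check:

val n = 12435L
println(n * (n + 1) / 2)   // 77320830
println((n + 1) / 2.0)     // 6218.0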

Tuesday, 4 August 2020

CTAS - Create Table As Select Example in Hive

CTAS - Create Table As Select
// We can redirect the output of a Hive statement into a brand new table using CTAS.

// Create table on the fly using the data coming from another table
hive> create table sub_maths_only as select * from ohm.subject where subject_name ='Maths';

hive> select * from sub_maths_only;
OK
100 Maths 10
103 Maths 5
106 Maths 25
109 Maths 11
Time taken: 0.091 seconds, Fetched: 4 row(s) 

hive> create table sub_science_only as select * from ohm.subject where subject_name ='Science';

hive> select * from sub_science_only;
OK
101 Science 20
104 Science 5
107 Science 25
110 Science 21
Time taken: 0.09 seconds, Fetched: 4 row(s)

// Aggregation sum, count example
hive> select subject_name, sum(students_attended), count(students_attended) from sub_maths_only GROUP BY subject_name;

Maths 51 4


hive> select subject_name, sum(students_attended), count(students_attended) from sub_science_only GROUP BY subject_name;

Science 71 4


select subject_name, sum(students_attended), count(students_attended) from sub_maths_only GROUP BY subject_name 
UNION ALL
select subject_name, sum(students_attended), count(students_attended) from sub_science_only GROUP BY subject_name;


Science 71 4
Maths 51 4

//CTAS -- Redirect the output of UNION ALL into a table
// Save the UNION ALL result in a subject_result table 
create table subject_result AS
select subject_name as subject, sum(students_attended) as totalstudents from sub_maths_only GROUP BY subject_name 
UNION ALL
select subject_name  as subject, sum(students_attended) as totalstudents from sub_science_only GROUP BY subject_name;

hive> select * from subject_result;
OK
Science 71
Maths 51
Time taken: 0.08 seconds, Fetched: 2 row(s)

External Table creation in Hive and loading 4 different data files located in HDFS

$ pwd
/home/cloudera/subjects

cat > list1.txt
id,subject_name,students_attended
100,Maths,10
101,Science,20

cat > list2.txt
id,subject_name,students_attended
103,Maths,5
104,Science,5

cat > list3.txt
id,subject_name,students_attended
106,Maths,25
107,Science,25

cat > list4.txt
id,subject_name,students_attended
109,Maths,11
110,Science,21

$ ls -lrt
total 16
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:24 list1.txt
-rw-rw-r-- 1 cloudera cloudera 77 Aug  4 05:24 list2.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:25 list3.txt
-rw-rw-r-- 1 cloudera cloudera 79 Aug  4 05:25 list4.txt

$ cat *
id,subject_name,students_attended
100,Maths,10
101,Science,20
id,subject_name,students_attended
103,Maths,5
104,Science,5
id,subject_name,students_attended
106,Maths,25
107,Science,25
id,subject_name,students_attended
109,Maths,11
110,Science,21



$ hdfs dfs -mkdir /user/cloudera/subjects-data
$ hdfs dfs -copyFromLocal *.* /user/cloudera/subjects-data/

$ hdfs dfs -ls /user/cloudera/subjects-data/
Found 4 items
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list1.txt
-rw-r--r--   1 cloudera supergroup         77 2020-08-04 05:41 /user/cloudera/subjects-data/list2.txt
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list3.txt
-rw-r--r--   1 cloudera supergroup         79 2020-08-04 05:41 /user/cloudera/subjects-data/list4.txt




// no LOCATION clause here - the data is loaded below with LOAD DATA LOCAL INPATH
create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ",";



hive> LOAD DATA LOCAL INPATH '/home/cloudera/subjects/' INTO TABLE ohm.subject;
Loading data to table ohm.subject
Table ohm.subject stats: [numFiles=4, totalSize=314]
OK
Time taken: 0.97 second

// Header rows show up in between (as NULLs in the int columns) because each file has its own header line
hive> select * from ohm.subject;
OK
NULL subject_name NULL
100 Maths 10
101 Science 20
NULL subject_name NULL
103 Maths 5
104 Science 5
NULL subject_name NULL
106 Maths 25
107 Science 25
NULL subject_name NULL
109 Maths 11
110 Science 21
Time taken: 0.511 seconds, Fetched: 16 row(s)

// drop the table - we are going to re-create it
hive> drop table ohm.subject;
OK
Time taken: 0.189 seconds

// skip the header line of each file
// LOCATION points to the HDFS directory holding the files
create external table ohm.subject
(
id int,
subject_name string,
students_attended int
)
row format delimited
fields terminated by ","
STORED AS TEXTFILE
LOCATION '/user/cloudera/subjects-data/'
TBLPROPERTIES("skip.header.line.count"="1");


hive> select * from ohm.subject;
OK
100 Maths 10
101 Science 20
103 Maths 5
104 Science 5
106 Maths 25
107 Science 25
109 Maths 11
110 Science 21
Time taken: 0.096 seconds, Fetched: 8 row(s)
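
For comparison, a spark-shell sketch (not part of the original post) reading the same four files, where header=true makes the CSV reader skip the first line of each file:

val subjects = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/user/cloudera/subjects-data/")

subjects.show()
subjects.groupBy("subject_name").sum("students_attended").show()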


// Aggregation operations in Hive


hive> select subject_name,sum(students_attended) from ohm.subject group by subject_name;

OK
Maths 51
Science 71



hive> select subject_name,sum(students_attended), count(subject_name) from ohm.subject group by subject_name;

Maths 51 4
Science 71 4

External table creation in Hive

// We have a data file on the local Linux filesystem
$ cat /home/cloudera/customers/customers.txt
id|fname|lname|city
100|Raja|Ravindar|Chennai
101|Suresh|Rahul|Trichy
102|Arun|Murali|Madurai
103|Kali|Das|Peravurani
104|Kalai|Selvi|Pattukottai

$ hdfs dfs -mkdir /user/cloudera/customer-ext/

// copy the local file into hdfs
$ hdfs dfs -copyFromLocal /home/cloudera/customers/customers.txt /user/cloudera/customer-ext/

// display the hdfs file
hdfs dfs -cat /user/cloudera/customer-ext/customers.txt
id|fname|lname|city
100|Raja|Ravindar|Chennai
101|Suresh|Rahul|Trichy
102|Arun|Murali|Madurai
103|Kali|Das|Peravurani
104|Kalai|Selvi|Pattukottai

hive> use ohm;
OK
Time taken: 0.022 seconds

// create an external table whose LOCATION points at the HDFS directory of the data file
// and skip the header line
create external table ohm.customers
(
id int,
fname string,
lname string,
city string
)
row format delimited
fields terminated by '|'
stored as textfile
location '/user/cloudera/customer-ext/'
tblproperties ("skip.header.line.count"="1");


hive> select * from ohm.customers;
OK
100 Raja Ravindar Chennai
101 Suresh Rahul Trichy
102 Arun Murali Madurai
103 Kali Das Peravurani
104 Kalai Selvi Pattukottai


hive> describe formatted ohm.customers;
OK
# col_name            data_type            comment             
 
id                  int                                      
fname                string                                  
lname                string                                  
city                string                                  
 
# Detailed Table Information  
Database:            ohm                   
Owner:              cloudera             
CreateTime:          Tue Aug 04 04:07:35 PDT 2020  
LastAccessTime:      UNKNOWN               
Protect Mode:        None                 
Retention:          0                     
Location:            hdfs://quickstart.cloudera:8020/user/cloudera/customer-ext  
Table Type:          EXTERNAL_TABLE       
Table Parameters:  
COLUMN_STATS_ACCURATE false               
EXTERNAL            TRUE                
numFiles            1                   
numRows              -1                  
rawDataSize          -1                  
skip.header.line.count 1                   
totalSize            146                 
transient_lastDdlTime 1596539255          
 
# Storage Information  
SerDe Library:      org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe  
InputFormat:        org.apache.hadoop.mapred.TextInputFormat  
OutputFormat:        org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat  
Compressed:          No                   
Num Buckets:        -1                   
Bucket Columns:      []                   
Sort Columns:        []                   
Storage Desc Params:  
field.delim          |                   
serialization.format |                   
Time taken: 0.051 seconds, Fetched: 37 row(s)


// Removing the table in Hive (External table)

hive> drop table ohm.customers;
OK
Time taken: 0.661 seconds


// When an external table is dropped, only the metastore entry is removed
// the data files in HDFS are left intact
$ hdfs dfs -ls /user/cloudera/customer-ext
Found 1 items
-rw-r--r--   1 cloudera supergroup        146 2020-08-04 04:05 /user/cloudera/customer-ext/customers.txt


Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...