Wednesday, 26 December 2018

CSV to MySQL to Hive via SQOOP import

// Log on to MySQL
[cloudera@quickstart ~]$ mysql -uroot -pcloudera

// Create database and Table
mysql> create database TRVL_DB;
Query OK, 1 row affected (0.06 sec)

mysql> use TRVL_DB;
Database changed

// #1. MySQL Table Creating Script
-- Staging table for the raw ticketing/sales CSV feed.
-- Column names and order must match the CSV header (consumed by LOAD DATA
-- with IGNORE 1 LINES) and the Sqoop import, so they are kept exactly as in
-- the source files. Most fields arrive as free-form text and are therefore
-- staged as VARCHAR; the VARCHAR(20) date/time columns hold raw timestamp
-- strings to be parsed downstream (e.g. in Hive).
-- Monetary columns use DECIMAL (exact fixed-point), not DOUBLE, so that
-- aggregations such as SUM(Price) are not subject to floating-point
-- rounding error.
CREATE TABLE IF NOT EXISTS SALES_TRN(
    ActivityID VARCHAR(100),
    DateTimeUTC VARCHAR(20),
    DateTime1 VARCHAR(100),
    Activity VARCHAR(100),
    Organisation VARCHAR(100),
    User_Name VARCHAR(100),
    User_Email VARCHAR(100),
    User_ID VARCHAR(100),
    Phone_Number VARCHAR(100),
    Reference VARCHAR(100),
    Ticket_VendorID VARCHAR(100),
    Ticket_DigiticketID VARCHAR(100),
    Ticket_FlashType VARCHAR(100),
    Ticket_Expiry VARCHAR(20),
    Ticket_TTL VARCHAR(100),
    Ticket_UpdateResolution VARCHAR(100),
    InitProductType VARCHAR(100),
    Price DECIMAL(12,2),        -- money: exact decimal, never DOUBLE
    Ticket_RoundExpiryDownToTime VARCHAR(100),
    Ticket_ProductID VARCHAR(100),
    Ticket_DeviceID VARCHAR(100),
    User_Agent VARCHAR(100),
    Ticket_FirstFetched VARCHAR(20),
    Ticket_Activated VARCHAR(20),
    Ticket_LastConfirmedExpiry VARCHAR(20),
    Ticket_Category VARCHAR(100),
    Ticket_BackgroundColor VARCHAR(100),
    Ticket_ForegroundColor VARCHAR(100),
    Ticket_TTLUnits VARCHAR(100),
    Ticket_ExpiryUpdated VARCHAR(20),
    Ticket_InitialExpiry VARCHAR(20),
    Ticket_ExpiryReason VARCHAR(100),
    Ticket_ForceExpiredBy VARCHAR(100),
    Ticket_ExpiryReasonDetail VARCHAR(100),
    Price_Waived VARCHAR(100),
    Ticket_UnlockedReason VARCHAR(100),
    Ticket_UnlockedBy VARCHAR(100),
    Ticket_LockedVersionID VARCHAR(100),
    Price_Carnet DECIMAL(12,2), -- money: exact decimal, never DOUBLE
    Ticket_Instance VARCHAR(100),
    Ticket_InstanceParent VARCHAR(100),
    Ticket_CompedBy VARCHAR(100),
    Ticket_CompedReason VARCHAR(100),
    Ticket_ForceExpiredReason VARCHAR(100),
    Ticket_AutoActivateDate VARCHAR(20),
    Ticket_AutoActivate VARCHAR(100),
    Ticket_AutoActivated VARCHAR(100),
    DeviceID VARCHAR(100),
    RemoteIP VARCHAR(100),
    DisableUpdate VARCHAR(100));



// Create a data-loader shell script that loads the contents of multiple CSV files into the MySQL table
// #2. MySQL data load script
# Write the loader script with a quoted heredoc (no ^C needed to end input,
# and no shell expansion happens while writing the file).
cat > myshellscript.sh << 'EOF'
#!/bin/sh
# Load every CSV in the current directory into TRVL_DB.SALES_TRN.
# The CSV header row of each file is skipped (IGNORE 1 LINES).
DATA_DIR=/home/cloudera/Desktop/POCs/Travel_Analysis

for f in *.csv
do
  echo "file=$f"
  # Build the statement in a variable instead of the fragile nested
  # single/double quoting of the original one-liner. "$f" is quoted so
  # file names containing spaces still work. Fields are comma-separated
  # and optionally enclosed by double quotes (which also act as the
  # escape character), exactly as before.
  sql="LOAD DATA LOCAL INFILE '$DATA_DIR/$f'
       INTO TABLE TRVL_DB.SALES_TRN
       COLUMNS TERMINATED BY ','
       OPTIONALLY ENCLOSED BY '\"'
       ESCAPED BY '\"'
       LINES TERMINATED BY '\n'
       IGNORE 1 LINES;"
  # --local-infile=1 explicitly enables client-side LOCAL INFILE
  # (required when the client default has it disabled).
  mysql --local-infile=1 -u root --password=cloudera TRVL_DB -e "$sql"
done
EOF

// Make the script executable (the execute bit is sufficient; avoid the overly permissive 777)
[cloudera@quickstart Travel_Analysis]$ chmod +x myshellscript.sh

// execute it in shell
[cloudera@quickstart Travel_Analysis]$ sh myshellscript.sh


// verify the data loaded into MySQL
mysql> select * from SALES_TRN;
24089 rows in set (0.16 sec)


mysql> select User_ID, count(ActivityID), sum(price) from SALES_TRN group by User_ID limit 10;
+--------------------------------------+-------------------+------------+
| User_ID                              | count(ActivityID) | sum(price) |
+--------------------------------------+-------------------+------------+
| 5c4cdfdb-303f-4cf5-9546-f31af0c1f6a2 |                21 |       2530 |
| 78bceeb8-f371-45c8-9fe5-0787628aae34 |               142 |      23460 |
| 994acdec-e15c-44ce-b48a-c5fa5b2b6465 |               187 |      30360 |
| AACMCwBhvma                          |                 2 |          0 |
| AACNw.Yt9b5uGA                       |                38 |       1840 |
| AAWRgrIGvaW                          |                 2 |          0 |
| ACFzjdJaopu                          |                 3 |          0 |
| AcKIHRhmQFu                          |                20 |          0 |
| AejMSFMapLm                          |                40 |          0 |
| AeQMfuwtYpu                          |                 1 |          0 |
+--------------------------------------+-------------------+------------+
10 rows in set (0.08 sec)



//Go to Hive
[cloudera@quickstart ~]$ hive

Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
WARNING: Hive CLI is deprecated and migration to Beeline is recommended.

hive> show databases;
OK
clickstream
default
weblog
Time taken: 0.851 seconds, Fetched: 3 row(s)

// Hive does not yet have a TRVL_DB database (only clickstream, default and weblog exist)
// Create the target database first
hive> create database TRVL_DB;
OK
Time taken: 0.088 seconds


// run the following sqoop import which imports sales_trn table from TRVL_DB of MySQL into Hive
//#3. Sqoop Import Script
[cloudera@quickstart ~]$ sqoop import --connect jdbc:mysql://localhost/TRVL_DB --username root --password cloudera -m 1 --table SALES_TRN  --fields-terminated-by "," --target-dir sales_trn --hive-import --create-hive-table --hive-table TRVL_DB.sales_trn


hive> use TRVL_DB;
OK
Time taken: 0.059 seconds

hive> show tables;
OK
sales_trn
Time taken: 0.056 seconds, Fetched: 1 row(s)


hive> select * from sales_trn limit 10;
OK
------------------
------------------

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...