Monday, 3 August 2020

Importing data from MySQL to HDFS using SQOOP based on conditions Where, Query, Columns options

Import data from MySQL to HDFS using SQOOP with conditional data importing

//Conditional import using Where  

sqoop import \
-connect jdbc:mysql://localhost/retail_db \
-username root \
-password cloudera \
-table customers \
-target-dir /user/cloudera/cust_nm_mary \
-delete-target-dir \
-where 'customer_fname="Mary"'

Retrieved 4741 records.

hdfs dfs -ls /user/cloudera/cust_nm_mary 
Found 5 items
-rw-r--r--   1 cloudera cloudera          0 2020-08-03 00:23 /user/cloudera/cust_nm_mary/_SUCCESS
-rw-r--r--   1 cloudera cloudera      89617 2020-08-03 00:23 /user/cloudera/cust_nm_mary/part-m-00000
-rw-r--r--   1 cloudera cloudera      88116 2020-08-03 00:23 /user/cloudera/cust_nm_mary/part-m-00001
-rw-r--r--   1 cloudera cloudera      89590 2020-08-03 00:23 /user/cloudera/cust_nm_mary/part-m-00002
-rw-r--r--   1 cloudera cloudera      90219 2020-08-03 00:23 /user/cloudera/cust_nm_mary/part-m-00003

// Look here : All the records have Mary but, here we have selected all the columns
hdfs dfs -cat /user/cloudera/cust_nm_mary/part-m-00000 | head
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069
6,Mary,Smith,XXXXXXXXX,XXXXXXXXX,3151 Sleepy Quail Promenade,Passaic,NJ,07055
9,Mary,Perez,XXXXXXXXX,XXXXXXXXX,3616 Quaking Street,Caguas,PR,00725
11,Mary,Huffman,XXXXXXXXX,XXXXXXXXX,3169 Stony Woods,Caguas,PR,00725
13,Mary,Baldwin,XXXXXXXXX,XXXXXXXXX,7922 Iron Oak Gardens,Caguas,PR,00725
17,Mary,Robinson,XXXXXXXXX,XXXXXXXXX,1325 Noble Pike,Taylor,MI,48180
20,Mary,Ellis,XXXXXXXXX,XXXXXXXXX,4703 Old Route,West New York,NJ,07093
24,Mary,Smith,XXXXXXXXX,XXXXXXXXX,9417 Emerald Towers,Caguas,PR,00725
27,Mary,Vincent,XXXXXXXXX,XXXXXXXXX,1768 Sleepy Zephyr Place,Caguas,PR,00725


// Retrieve data of specific columns only along with where condition
sqoop import \
-connect jdbc:mysql://localhost/retail_db \
-username root \
-password cloudera \
-table customers \
-target-dir /user/cloudera/cust_col_sel \
-delete-target-dir \
-columns "customer_fname, customer_lname, customer_city" \
-where 'customer_fname="Mary"'


 hdfs dfs -ls /user/cloudera/cust_col_sel
Found 5 items
-rw-r--r--   1 cloudera cloudera          0 2020-08-03 00:43 /user/cloudera/cust_col_sel/_SUCCESS
-rw-r--r--   1 cloudera cloudera      24361 2020-08-03 00:43 /user/cloudera/cust_col_sel/part-m-00000
-rw-r--r--   1 cloudera cloudera      23816 2020-08-03 00:43 /user/cloudera/cust_col_sel/part-m-00001
-rw-r--r--   1 cloudera cloudera      24280 2020-08-03 00:43 /user/cloudera/cust_col_sel/part-m-00002
-rw-r--r--   1 cloudera cloudera      24254 2020-08-03 00:43 /user/cloudera/cust_col_sel/part-m-00003

// Retrieved First Name, Last name, city only
$ hdfs dfs -cat /user/cloudera/cust_col_sel/part-m-00000 | head
Mary,Barrett,Littleton
Mary,Jones,San Marcos
Mary,Smith,Passaic
Mary,Perez,Caguas
Mary,Huffman,Caguas
Mary,Baldwin,Caguas
Mary,Robinson,Taylor
Mary,Ellis,West New York
Mary,Smith,Caguas
Mary,Vincent,Caguas


mysql> select customer_fname, customer_lname, customer_city from retail_db.customers where customer_id between 100 and 110;
+----------------+----------------+-------------------+
| customer_fname | customer_lname | customer_city     |
+----------------+----------------+-------------------+
| George         | Barrett        | Caguas            |
| Mary           | Briggs         | Dorchester Center |
| Gregory        | Smith          | Caguas            |
| Mary           | Williams       | Broomfield        |
| Debra          | Wood           | Saint Paul        |
| Mary           | Smith          | Caguas            |
| Lauren         | Freeman        | Napa              |
| Mary           | Weaver         | Caguas            |
| Joshua         | Smith          | Bronx             |
| Mary           | Thompson       | Caguas            |
| David          | Miranda        | Beaverton         |
+----------------+----------------+-------------------+

// Cannot specify --query and --table together.
//AND \$CONDITIONS must be included as part of the SELECT query
// Retrieve data based on the select query
sqoop import \
-connect jdbc:mysql://localhost/retail_db \
-username root \
-password cloudera \
-target-dir /user/cloudera/cust_qry_rstl \
-delete-target-dir \
-query "select customer_id,customer_fname, customer_lname, customer_city from retail_db.customers where customer_id between 100 and 110 AND \$CONDITIONS" \
-split-by  customer_id



hdfs dfs -ls /user/cloudera/cust_qry_rstl
Found 5 items
-rw-r--r--   1 cloudera cloudera          0 2020-08-03 01:11 /user/cloudera/cust_qry_rstl/_SUCCESS
-rw-r--r--   1 cloudera cloudera         85 2020-08-03 01:11 /user/cloudera/cust_qry_rstl/part-m-00000
-rw-r--r--   1 cloudera cloudera         77 2020-08-03 01:11 /user/cloudera/cust_qry_rstl/part-m-00001
-rw-r--r--   1 cloudera cloudera         47 2020-08-03 01:11 /user/cloudera/cust_qry_rstl/part-m-00002
-rw-r--r--   1 cloudera cloudera         76 2020-08-03 01:11 /user/cloudera/cust_qry_rstl/part-m-00003

// display the data written in HDFS
$ hdfs dfs -cat /user/cloudera/cust_qry_rstl/* 
100,George,Barrett,Caguas
101,Mary,Briggs,Dorchester Center
102,Gregory,Smith,Caguas
103,Mary,Williams,Broomfield
104,Debra,Wood,Saint Paul
105,Mary,Smith,Caguas
106,Lauren,Freeman,Napa
107,Mary,Weaver,Caguas
108,Joshua,Smith,Bronx
109,Mary,Thompson,Caguas
110,David,Miranda,Beaverton

No comments:

Post a Comment

Flume - Simple Demo

// create a folder in hdfs : $ hdfs dfs -mkdir /user/flumeExa // Create a shell script which generates : Hadoop in real world <n>...