Compression methods:
Gzip, Snappy, Deflate, bzip2, lz4
//Gzip Compression (the default codec when -compress is given without -compression-codec)
sqoop import \
-connect jdbc:mysql://localhost:3306/retail_db \
-username root \
-password cloudera \
-table customers \
-target-dir /user/cloudera/cust_gzip \
-delete-target-dir \
-compress
hdfs dfs -ls /user/cloudera/cust_gzip
Found 5 items
-rw-r--r-- 1 cloudera cloudera 0 2020-08-02 23:06 /user/cloudera/cust_gzip/_SUCCESS
-rw-r--r-- 1 cloudera cloudera 63937 2020-08-02 23:06 /user/cloudera/cust_gzip/part-m-00000.gz
-rw-r--r-- 1 cloudera cloudera 63708 2020-08-02 23:05 /user/cloudera/cust_gzip/part-m-00001.gz
-rw-r--r-- 1 cloudera cloudera 64171 2020-08-02 23:06 /user/cloudera/cust_gzip/part-m-00002.gz
-rw-r--r-- 1 cloudera cloudera 63905 2020-08-02 23:05 /user/cloudera/cust_gzip/part-m-00003.gz
//Snappy Compression
sqoop import \
-connect jdbc:mysql://localhost:3306/retail_db \
-username root \
-password cloudera \
-table customers \
-target-dir /user/cloudera/cust_snappy \
-delete-target-dir \
-compress \
-compression-codec snappy
$ hdfs dfs -ls /user/cloudera/cust_snappy
Found 5 items
-rw-r--r-- 1 cloudera cloudera 0 2020-08-02 23:07 /user/cloudera/cust_snappy/_SUCCESS
-rw-r--r-- 1 cloudera cloudera 110165 2020-08-02 23:07 /user/cloudera/cust_snappy/part-m-00000.snappy
-rw-r--r-- 1 cloudera cloudera 109884 2020-08-02 23:07 /user/cloudera/cust_snappy/part-m-00001.snappy
-rw-r--r-- 1 cloudera cloudera 110479 2020-08-02 23:07 /user/cloudera/cust_snappy/part-m-00002.snappy
-rw-r--r-- 1 cloudera cloudera 110616 2020-08-02 23:07 /user/cloudera/cust_snappy/part-m-00003.snappy
//Deflate Compression
sqoop import \
-connect jdbc:mysql://localhost:3306/retail_db \
-username root \
-password cloudera \
-table customers \
-target-dir /user/cloudera/cust_deflate \
-delete-target-dir \
-compress \
-compression-codec deflate
hdfs dfs -ls /user/cloudera/cust_deflate
Found 5 items
-rw-r--r-- 1 cloudera cloudera 0 2020-08-02 23:16 /user/cloudera/cust_deflate/_SUCCESS
-rw-r--r-- 1 cloudera cloudera 63925 2020-08-02 23:16 /user/cloudera/cust_deflate/part-m-00000.deflate
-rw-r--r-- 1 cloudera cloudera 63696 2020-08-02 23:16 /user/cloudera/cust_deflate/part-m-00001.deflate
-rw-r--r-- 1 cloudera cloudera 64159 2020-08-02 23:16 /user/cloudera/cust_deflate/part-m-00002.deflate
-rw-r--r-- 1 cloudera cloudera 63893 2020-08-02 23:16 /user/cloudera/cust_deflate/part-m-00003.deflate
//Bzip2 Compression
sqoop import \
-connect jdbc:mysql://localhost:3306/retail_db \
-username root \
-password cloudera \
-table customers \
-target-dir /user/cloudera/cust_bzip \
-delete-target-dir \
-compress \
-compression-codec bzip2
hdfs dfs -ls /user/cloudera/cust_bzip
Found 5 items
-rw-r--r-- 1 cloudera cloudera 0 2020-08-02 23:09 /user/cloudera/cust_bzip/_SUCCESS
-rw-r--r-- 1 cloudera cloudera 41632 2020-08-02 23:09 /user/cloudera/cust_bzip/part-m-00000.bz2
-rw-r--r-- 1 cloudera cloudera 41703 2020-08-02 23:09 /user/cloudera/cust_bzip/part-m-00001.bz2
-rw-r--r-- 1 cloudera cloudera 41869 2020-08-02 23:09 /user/cloudera/cust_bzip/part-m-00002.bz2
-rw-r--r-- 1 cloudera cloudera 41910 2020-08-02 23:09 /user/cloudera/cust_bzip/part-m-00003.bz2
//Lz4 Compression
sqoop import \
-connect jdbc:mysql://localhost:3306/retail_db \
-username root \
-password cloudera \
-table customers \
-target-dir /user/cloudera/cust_lz4 \
-delete-target-dir \
-compress \
-compression-codec Lz4
hdfs dfs -ls /user/cloudera/cust_lz4
Found 5 items
-rw-r--r-- 1 cloudera cloudera 0 2020-08-02 23:13 /user/cloudera/cust_lz4/_SUCCESS
-rw-r--r-- 1 cloudera cloudera 97878 2020-08-02 23:12 /user/cloudera/cust_lz4/part-m-00000.lz4
-rw-r--r-- 1 cloudera cloudera 97610 2020-08-02 23:12 /user/cloudera/cust_lz4/part-m-00001.lz4
-rw-r--r-- 1 cloudera cloudera 98155 2020-08-02 23:13 /user/cloudera/cust_lz4/part-m-00002.lz4
-rw-r--r-- 1 cloudera cloudera 98349 2020-08-02 23:12 /user/cloudera/cust_lz4/part-m-00003.lz4
GZip : Good compression ratio
GZip files are not splittable
HDFS splits files into blocks of 128 MB (the default block size)
BZip2 :
Very good compression ratio, but very slow and CPU-intensive
Snappy :
Very fast, splittable, modest compression ratio
Deflate : splittable format
: Error prone
: Not widely accepted
LZ4 : modest compression ratio, fast, splittable, best choice
No comments:
Post a Comment