Wednesday, 15 June 2016

Get CSV data from SFTP and create Hive Table using Python script

Contents of the Expect script (fetches the CSV over SFTP and drives the HDFS/Hive load):

#!/usr/bin/expect
# Fetch one CSV over SFTP, push it to HDFS, then generate and run a Hive DDL
# script for it. Driven entirely by expect prompt/response pairs.
# NOTE(review): the SFTP password is embedded in plain text below — key-based
# auth would avoid shipping credentials in this script.
cd /home/admin123/SFTP/sftpdata
# --- Stage 1: download the CSV from the SFTP server -------------------------
spawn sftp root@ftp.xxxxx.com
expect "password:"
send "ihCtwxxxxxxxxxxxxq\n"
expect "sftp>"
send "get /root/FRED-GGGDTPBRA188N.csv\n"
expect "sftp>"
send "exit\n"
expect ":~$"
# --- Stage 2: copy the file into HDFS ---------------------------------------
# NOTE(review): `hadoop fs` is not an interactive shell, so it never prints a
# ":~$" prompt — each `expect ":~$"` below presumably only returns after the
# default timeout / EOF. Confirm whether `expect eof` per spawn was intended.
spawn hadoop fs -mkdir -p /ftpdata/testdirdump
expect ":~$"
spawn hadoop fs -copyFromLocal -f /home/admin123/SFTP/sftpdata/FRED-GGGDTPBRA188N.csv /ftpdata/testdirdump
expect ":~$"
# --- Stage 3: generate the Hive DDL (.hql) from the CSV header --------------
# Args: <csv path> <database> <table> <hdfs dir> <output .hql prefix>
spawn python /home/admin123/scripts/GenerateHiveScript.py /home/admin123/SFTP/sftpdata/FRED-GGGDTPBRA188N.csv sftptestdata sftpFRED /ftpdata/testdirdump /home/admin123/SFTP/sftphivequery/hivequery
expect ":~$"
# Output file name = prefix + table name + ".hql"
spawn sudo chmod -R 777 /home/admin123/SFTP/sftphivequery/hivequerysftpFRED.hql
expect ":~$"
# --- Stage 4: run the generated DDL in Hive ---------------------------------
spawn hive -f /home/admin123/SFTP/sftphivequery/hivequerysftpFRED.hql
expect eof
# Capture the exit status of the last spawned process.
catch wait result


Contents of the Python script GenerateHiveScript.py (generates the Hive DDL from the CSV header):

'''
Created on 09-May-2016

@author: Lokesh.Nanda
'''

#import string
import re
#import time
import sys
import csv


source_path = sys.argv[1]
#F:\StockStreamingData\NSEStockData.20160428-131159.csv
database_name = sys.argv[2]
database_name_mod = re.sub('[^a-zA-Z0-9 \n\.]', '_', database_name)
#ftpdata
table_name = sys.argv[3]
table_name_mod = re.sub('[^a-zA-Z0-9 \n\.]', '_', table_name)
#testtable
hdfs_file_loc = sys.argv[4]
#/datapull/stocks
hive_query_path = sys.argv[5]
#F:\StockStreamingData\hivequery

with open(source_path, 'r') as f:
#with open('/home/admin123/NSEStockData.20160505-060003.csv', 'r') as f:
#with open('C:\Users\lokesh.nanda\Desktop\MOCK_DATA.csv', 'r') as f:
        first_line = f.readline()
        flag = False
        symbol = "~`!@#$%^&*()-+={}[]:>;'</?*-+"
        for i in first_line:
            if i in symbol:
                flag = True
                break
       
        #a = ","
        #count_comma = first_line.count(a)
        #first_line = csv.reader(first_line)
        #count_comma = first_line.count

for item in csv.reader([first_line], skipinitialspace=True):
   count_comma = len(item)

       
        finaldump = "create database if not exists " + database_name_mod + "; use " + database_name_mod + "; create external table " + table_name_mod +"("
        if(flag):
            i = 0
            while(i <= count_comma):
                finaldump = finaldump + "col"+ str(i) + " String,"
                i = i+1
        else:
            columns = first_line.split(',')
   #columns = csv.reader(first_line)
            for col in columns:
                finaldump = finaldump + "`" + col + "` String,"
        finaldump = finaldump[:-1] + ") ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '" +hdfs_file_loc+ "';"

with open(hive_query_path + table_name + '.hql', 'w') as the_file:

    the_file.write(finaldump)

No comments:

Post a Comment